
MNN:Sync: Sync Internal 2.9.5

xiaying, 10 months ago
parent commit 1effb0c9e5
100 files changed, with 4467 additions and 845 deletions
  1. +0 -4 3rd_party/OpenCLHeaders/CL/cl2.hpp
  2. +1 -0 CMakeLists.txt
  3. +1 -1 codegen/OpFuse.cpp
  4. +1 -0 docs/compile/cmake.md
  5. +53 -37 docs/contribute/backend.md
  6. +9 -1 docs/contribute/op.md
  7. +2 -2 docs/faq.md
  8. +1 -1 docs/index.rst
  9. +77 -9 docs/inference/module.md
  10. +2 -2 docs/start/overall.md
  11. +0 -1 docs/tools/convert.md
  12. +5 -1 docs/tools/quant.md
  13. +23 -11 docs/tools/test.md
  14. +11 -12 docs/transformers/diffusion.md
  15. +78 -62 docs/transformers/llm.md
  16. +2 -3 express/Executor.cpp
  17. +7 -1 express/module/Module.cpp
  18. +7 -15 express/module/StaticModule.cpp
  19. +12 -5 include/MNN/Interpreter.hpp
  20. +1 -1 include/MNN/MNNDefine.h
  21. +2 -21 package_scripts/ios/buildiOS.sh
  22. +42 -0 package_scripts/ios/buildiOS_with_armv7.sh
  23. +4 -4 package_scripts/mac/buildFrameWork.sh
  24. +0 -1 project/android/build_32.sh
  25. +86 -2 project/ios/MNN.xcodeproj/project.pbxproj
  26. +12 -4 pymnn/test/model_test.py
  27. +1 -0 source/backend/arm82/Arm82Backend.cpp
  28. +10 -14 source/backend/arm82/Arm82Functions.cpp
  29. +7 -0 source/backend/arm82/CMakeLists.txt
  30. +17 -7 source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuanInput_ARM82.S
  31. +3 -6 source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantAndReorder_ARM82.S
  32. +13 -38 source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S
  33. +69 -60 source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S
  34. +61 -23 source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S
  35. +0 -0 source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulFP16_int4.S
  36. +0 -0 source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulFP16_int8.S
  37. +0 -0 source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulRemainFP16_int4.S
  38. +0 -0 source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulRemainFP16_int8.S
  39. +1 -1 source/backend/coreml/backend/CoreMLBackend.cpp
  40. +1 -1 source/backend/coreml/backend/CoreMLBackend.hpp
  41. +4 -0 source/backend/cpu/CMakeLists.txt
  42. +144 -39 source/backend/cpu/CPUAttention.cpp
  43. +7 -2 source/backend/cpu/CPUAttention.hpp
  44. +112 -43 source/backend/cpu/CPUBackend.cpp
  45. +17 -7 source/backend/cpu/CPUBackend.hpp
  46. +2 -3 source/backend/cpu/CPUCast.cpp
  47. +0 -170 source/backend/cpu/CPUConvolution.cpp
  48. +1 -6 source/backend/cpu/CPUConvolution.hpp
  49. +4 -2 source/backend/cpu/CPUDeconvolution.cpp
  50. +1 -1 source/backend/cpu/CPUDynamicQuant.cpp
  51. +2 -2 source/backend/cpu/CPUFloatToInt8.cpp
  52. +1 -1 source/backend/cpu/CPUFloatToInt8.hpp
  53. +0 -1 source/backend/cpu/CPUImageProcess.cpp
  54. +14 -9 source/backend/cpu/CPUProposal.cpp
  55. +8 -1 source/backend/cpu/CPUProposal.hpp
  56. +223 -69 source/backend/cpu/KVCacheManager.cpp
  57. +51 -19 source/backend/cpu/KVCacheManager.hpp
  58. +4 -0 source/backend/cpu/arm/CMakeLists.txt
  59. +33 -0 source/backend/cpu/arm/arm32/MNNBGRAToBGRC8.S
  60. +43 -0 source/backend/cpu/arm/arm32/MNNBGRAToGRAYFast.S
  61. +46 -0 source/backend/cpu/arm/arm32/MNNBGRToBGR555Fast.S
  62. +51 -0 source/backend/cpu/arm/arm32/MNNBGRToBGR565Fast.S
  63. +46 -0 source/backend/cpu/arm/arm32/MNNBGRToGRAYFast.S
  64. +34 -0 source/backend/cpu/arm/arm32/MNNC3ToC4Fast.S
  65. +95 -0 source/backend/cpu/arm/arm32/MNNC3ToXYZFast.S
  66. +98 -0 source/backend/cpu/arm/arm32/MNNC3ToYUVFast.S
  67. +33 -10 source/backend/cpu/arm/arm32/MNNFloat2Int8.S
  68. +35 -0 source/backend/cpu/arm/arm32/MNNGRAYToC3Fast.S
  69. +36 -0 source/backend/cpu/arm/arm32/MNNGRAYToC4Fast.S
  70. +2 -1 source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S
  71. +2 -0 source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S
  72. +2 -1 source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S
  73. +38 -0 source/backend/cpu/arm/arm32/MNNRGBAToBGRAFast.S
  74. +38 -0 source/backend/cpu/arm/arm32/MNNRGBAToBGRFast.S
  75. +43 -0 source/backend/cpu/arm/arm32/MNNRGBAToGRAYFast.S
  76. +46 -0 source/backend/cpu/arm/arm32/MNNRGBToBGR555Fast.S
  77. +54 -0 source/backend/cpu/arm/arm32/MNNRGBToBGR565Fast.S
  78. +36 -0 source/backend/cpu/arm/arm32/MNNRGBToBGRC8.S
  79. +43 -0 source/backend/cpu/arm/arm32/MNNRGBToGRAYFast.S
  80. +129 -0 source/backend/cpu/arm/arm64/MNNBGRAToBGR.S
  81. +92 -0 source/backend/cpu/arm/arm64/MNNBGRAToGRAY.S
  82. +169 -0 source/backend/cpu/arm/arm64/MNNBGRToBGR555.S
  83. +187 -0 source/backend/cpu/arm/arm64/MNNBGRToBGR565.S
  84. +92 -0 source/backend/cpu/arm/arm64/MNNBGRToGRAY.S
  85. +116 -0 source/backend/cpu/arm/arm64/MNNC3ToC4Fast.S
  86. +88 -0 source/backend/cpu/arm/arm64/MNNC3ToXYZFast.S
  87. +92 -0 source/backend/cpu/arm/arm64/MNNC3ToYUVFast.S
  88. +83 -77 source/backend/cpu/arm/arm64/MNNFloat2Int8.S
  89. +124 -0 source/backend/cpu/arm/arm64/MNNGRAYToC3Fast.S
  90. +139 -0 source/backend/cpu/arm/arm64/MNNGRAYToC4Fast.S
  91. +27 -30 source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S
  92. +107 -0 source/backend/cpu/arm/arm64/MNNPackC2.S
  93. +147 -0 source/backend/cpu/arm/arm64/MNNRGBAToBGRAFast.S
  94. +134 -0 source/backend/cpu/arm/arm64/MNNRGBAToBGRFast.S
  95. +96 -0 source/backend/cpu/arm/arm64/MNNRGBAToGRAYFast.S
  96. +126 -0 source/backend/cpu/arm/arm64/MNNRGBToBGR.S
  97. +169 -0 source/backend/cpu/arm/arm64/MNNRGBToBGR555.S
  98. +187 -0 source/backend/cpu/arm/arm64/MNNRGBToBGR565.S
  99. +92 -0 source/backend/cpu/arm/arm64/MNNRGBToGRAYFast.S
  100. +0 -0 source/backend/cpu/arm/arm64/MNNSamplerC3BilinearOpt.S

+ 0 - 4
3rd_party/OpenCLHeaders/CL/cl2.hpp

@@ -403,10 +403,6 @@
 # pragma message("cl2.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead")
 # define CL_HPP_USE_CL_DEVICE_FISSION
 #endif
-#if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS)
-# pragma message("cl2.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead")
-# define CL_HPP_ENABLE_EXCEPTIONS
-#endif
 #if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR)
 # pragma message("cl2.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead")
 # define CL_HPP_NO_STD_VECTOR

+ 1 - 0
CMakeLists.txt

@@ -63,6 +63,7 @@ option(MNN_INTERNAL "Build with MNN internal features, such as model authenticat
 option(MNN_JNI "Build MNN Jni for java to use" OFF)
 option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF)
 option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF)
+option(MNN_CPU_WEIGHT_DEQUANT_GEMM "Build MNN CPU weight dequant related gemm kernels." OFF)
 
 IF (OHOS)
   include($ENV{NODE_PATH}/@ali/tcpkg/tcpkg.cmake)

+ 1 - 1
codegen/OpFuse.cpp

@@ -275,7 +275,7 @@ bool codegen(std::vector<Schedule::OpCacheInfo>& infos, std::vector<std::vector<
         auto inputs = tensors.first;
         auto outputs = tensors.second;
         // build Plugin Op
-        SharedPtr<Command> cmdPlugin;
+        std::shared_ptr<Command> cmdPlugin;
         {
             auto sourceCode = fuseModule.codegen();
             if(mapKernelSources.find(sourceCode) == mapKernelSources.end()) {

+ 1 - 0
docs/compile/cmake.md

@@ -80,6 +80,7 @@ MNN使用CMake构建项目,CMake中的宏定义列表如下:
 | MNN_OPENCV_BENCH     | 构建MNN的OpenCV功能是否开启性能benchmark,默认为`OFF` |
 | MNN_VULKAN_IMAGE     | 构建MNN的Vulkan后端时采用Image内存模式,以便支持FP16和部分移动端上GPU的加速,默认为`ON` |
 | MNN_LOW_MEMORY       | 是否支持低内存模式,支持低内存模式使用权值量化模型并设置`low_memory`则会使用计算时反量化,默认为`OFF` |
+| MNN_CPU_WEIGHT_DEQUANT_GEMM       | 是否编译CPU权重反量化的矩阵乘Kernel, 如果打开该编译宏并且在CPU推理时设置MNN::BackendConfig::MemoryMode=Memory_Normal,就会使用权重反量化算子进行权重量化模型的推理,默认为`OFF` |
 | MNN_SUPPORT_RENDER   | 是否支持图形渲染相关算子实现,默认为 `OFF` |
 | MNN_SUPPORT_TRANSFORMER_FUSE | 是否支持Fuse Transformer相关OP实现,默认为 `OFF` |
 | MNN_BUILD_LLM        | 是否构建基于MNN的llm库和demo,默认为`OFF` |
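
To make the new MNN_CPU_WEIGHT_DEQUANT_GEMM option concrete, here is a minimal sketch (model path and tensor names are placeholders) of requesting `MNN::BackendConfig::MemoryMode = Memory_Normal` for CPU inference, which is the condition under which the weight-dequant GEMM kernels would be used for a weight-quantized model when the library is built with this option:

```cpp
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/expr/Module.hpp>

using namespace MNN;
using namespace MNN::Express;

int main() {
    // Only takes effect when the library was built with -DMNN_CPU_WEIGHT_DEQUANT_GEMM=ON.
    BackendConfig backendConfig;
    backendConfig.memory = BackendConfig::Memory_Normal; // request the weight-dequant GEMM path
    std::shared_ptr<Executor> executor = Executor::newExecutor(MNN_FORWARD_CPU, backendConfig, 4);
    ExecutorScope scope(executor);

    // "quant_model.mnn", "input" and "output" are placeholder names for a weight-quantized model.
    std::shared_ptr<Module> net(Module::load({"input"}, {"output"}, "quant_model.mnn"));
    return net ? 0 : 1;
}
```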

The diff for this file has been suppressed because it is too large
+ 53 - 37
docs/contribute/backend.md


+ 9 - 1
docs/contribute/op.md

@@ -1,6 +1,14 @@
 # 自定义算子
 ## 概述
-在添加自定义算子前,请参阅[算子列表](../en/ops),避免不必要的重复。
+在添加自定义算子前,请查看算子列表,避免不必要的重复。
+
+```bash
+./MNNConvert -f CAFFE --OP
+./MNNConvert -f TF --OP
+./MNNConvert -f ONNX --OP
+./MNNConvert -f TORCH --OP 
+```
+
 ### MNN 算子转换与实现结构
 MNN 的算子转换与实现如下图,
 - 模型转换包括以下步骤,二选一:

+ 2 - 2
docs/faq.md

@@ -250,7 +250,7 @@ OpenCL / Vulkan 采用静态变量自注册的方式往 MNN 主库注册后端.
 
 
 ## 性能相关
-### 使用 GPU 时,调用 copyToHostTensor / copyFromHostTensor 非常慢
+### 使用 GPU 时,调用 copyToHostTensor / readMap 非常慢
 GPU 后端调用 copy 的时间包含两个部分
 
 - 异构数据拷贝
@@ -258,7 +258,7 @@ GPU 后端调用 copy 的时间包含两个部分
 
 对 GPU 后端而言,在数据被要求对用户可见(比如复制 output tensor 数据出来)之前,是允许异步执行的。
 在数据被用户要求可见之时,会等待相应的异步操作完成。
-因此有可能 复制 output tensor 的过程包括了等待 GPU 算子异步执行完成,导致缓慢。
+因此有可能 复制 output tensor 的过程包括了等待 GPU 算子异步执行完成,导致看上去缓慢。
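
A minimal sketch of observing this (the model path is a placeholder and the timing pattern is only illustrative, not the project's benchmark tool): the `copyToHostTensor` call is where any pending asynchronous GPU work is waited on, so its measured cost includes kernel execution time, not just the copy.

```cpp
#include <MNN/Interpreter.hpp>
#include <chrono>
#include <cstdio>
#include <memory>

int main() {
    // "model.mnn" is a placeholder path.
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn"));
    MNN::ScheduleConfig config;
    config.type = MNN_FORWARD_OPENCL;          // any GPU backend shows the effect
    auto session = net->createSession(config);
    net->runSession(session);                  // may return before the GPU has finished
    auto deviceOutput = net->getSessionOutput(session, nullptr);
    std::shared_ptr<MNN::Tensor> hostOutput(MNN::Tensor::createHostTensorFromDevice(deviceOutput, false));
    auto begin = std::chrono::high_resolution_clock::now();
    deviceOutput->copyToHostTensor(hostOutput.get()); // waits for pending GPU ops, then converts layout
    auto end = std::chrono::high_resolution_clock::now();
    printf("copy cost %lld us\n",
           (long long)std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count());
    return 0;
}
```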
 ### GPU 为什么比 CPU 跑得慢?
 有如下原因: 
 

+ 1 - 1
docs/index.rst

@@ -72,7 +72,7 @@
 
 .. toctree::
    :maxdepth: 1
-   :caption: 测试工具
+   :caption: 工具
    :name: tools
 
    tools/convert

+ 77 - 9
docs/inference/module.md

@@ -5,19 +5,25 @@
 - 模型推理与`Session`的区别是不需要用户显式resize,支持控制流,所以当模型中有`if`或`while`时必须使用`Module`推理
 ### 相关数据结构
 - `Module` Module接口的核心类,表示一个模型的虚类;实际加载模型时会创建其子类
-- `Executor` 包含若干个`RuntimeManager`,提供内存管理接口,每个`Executor`必须在单线程环境下运行。默认提供全局 `Executor`,需要并发执行时,可自行创建。
-- `ExecutorScope`  用于在子线程中绑定`Executor`,多线程并发必需
-- `VARP` 作为`Module`的输入输出,也是[Expr API](expr.md)中的基础数据结构
+- `Executor` 提供内存管理和后端资源管理能力,每个`Executor`必须在单线程环境下运行。同一个`Executor`可以用于多个顺序执行的`Module`
+- `ExecutorScope`  用于在子线程中绑定`Executor`,多线程并发必需。默认在创建`Module`时使用全局 `Executor`,如果有多个Module在不同线程并发执行时,需要各自创建`Executor`,并用`ExecutorScope`绑定。
+- `VARP` `Module`的输入输出,也是[Expr API](expr.md)中的基础数据结构
 
 ## 工作流程
-配置Executor(可选) -> 创建 RuntimeManager(可选) -> 创建Module -> 创建输入VARP -> 使用Module::forwad推理 -> 使用输出VARP -> 销毁Module
-### (可选)配置Executor
-`Executor`给用户提供接口来配置推理后端、线程数等属性,以及做性能统计、算子执行的回调函数、内存回收等功能。 提供一个全局的Exector对象,用户不用创建或持有对象即可直接使用
+创建和配置Executor -> 创建 RuntimeManager(可选) -> 创建Module -> 创建输入VARP -> 使用Module::forwad推理 -> 使用输出VARP -> 销毁Module -> 销毁Executor
+### 创建和配置Executor
+`Executor`给用户提供接口来配置推理后端、线程数等属性,以及做性能统计、算子执行的回调函数、内存回收等功能。 推荐针对自身模块创建单独的Executor ,若使用全局的Exector对象,对于多个模块在不同线程运行时可能会发生冲突
 ```cpp
-// 配置默认全局Exector
-MNN::BackendConfig backend_config;    // default backend config 
+// 创建Exector
+MNN::BackendConfig backendConfig;    // default backend config 
+std::shared_ptr<MNN::Express::Executor> executor = MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, backendConfig, 1);
+
 // 设置使用4线程+CPU
-MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, backend_config, 4);
+executor->setGlobalExecutorConfig(MNN_FORWARD_CPU, backend_config, 4);
+
+// 绑定Executor,在创建/销毁/使用Module或进行表达式计算之前都需要绑定
+MNN::Express::ExecutorScope _s(executor);
+
 ``` 
 
 ### (可选)创建 RuntimeManager
@@ -39,6 +45,68 @@ std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtmgr(MNN::Express::Exec
 rtmgr->setCache(".cachefile");
 ```
 
+RuntimeManager 可以设置 hint , mode , cache, externalpath ,以支持扩展功能。
+
+```
+void setCache(std::string cacheName);
+void updateCache();
+void setMode(Interpreter::SessionMode mode);
+void setHint(Interpreter::HintMode mode, int value);
+void setExternalPath(std::string path, int type);
+bool getInfo(Interpreter::SessionInfoCode code, void* ptr);
+```
+
+#### cache 设置
+对于GPU后端(Metal/OpenCL等),可以设置缓存文件路径,存储AutoTuning结果和Program编译结果,以加速第二次之后的Module load 过程。
+
+```
+    std::shared_ptr<Executor::RuntimeManager> rtmgr(Executor::RuntimeManager::createRuntimeManager(config));
+    rtmgr->setCache(cacheFileName);
+
+    std::shared_ptr<Module> module(Module::load(inputNames, outputNames, modelName.c_str(), rtmgr, mdConfig));
+    /*... Make Inputs*/
+    auto outputs = module->onForward(inputs);
+
+    // Update cache file
+    rtmgr->updateCache();
+```
+
+#### mode 设置
+可以通过设置mode开启/关闭一些功能,示例:
+
+```
+// 创建出来的 Module 支持插入回调函数
+rtmgr->setMode(Interpreter::Session_Debug);
+```
+
+并非所有枚举都适用 Module 的创建,有效值如下:
+
+- Interpreter::SessionMode::Session_Debug : 支持逐算子调试
+- Interpreter::SessionMode::Session_Release : 关闭逐算子调试功能,可以轻微提升性能【默认选项】
+- Interpreter::SessionMode::Session_Backend_Fix : 固定使用用户设置的后端【默认选项】
+- Interpreter::SessionMode::Session_Backend_Auto : MNN根据用户倾向,预估load Module耗时,如果耗时较短则使用用户设置的后端,否则使用CPU
+
+
+#### hint 设置
+通过 hint 设置,可以在后端支持的情况下设置相应属性,有效值如下:
+
+- Interpreter::HintMode::WINOGRAD_MEMORY_LEVEL :使用 Winograd 算法优化卷积时,内存占用倾向,默认为 3 ,若希望降低内存占用可设为 0 
+- Interpreter::HintMode::GEOMETRY_COMPUTE_MASK :几何计算相关优化开关,1为区域合并,2为复合区域合并,4为使用loop算子,8为支持几何计算重计算,需要多个功能开启时把对应值叠加。默认为功能全开。
+- Interpreter::HintMode::DYNAMIC_QUANT_OPTIONS :动态量化选项,1为 Per Batch,2为Per Tensor 。默认为2。
+- Interpreter::HintMode::CPU_LITTLECORE_DECREASE_RATE :对于 Android 设备存在大中小核的情况,大核算力到中核算力的衰减比例。默认为50(中核算力为大核的50%)
+
+
+#### ExternalPath
+在设备可能出现内存不足时,可以通过 setExternalPath 指定路径,让MNN把部分内存用mmap分配。这样操作系统可在内存不足时会将其转换为读写文件,避免内存不足程序闪退。示例:
+
+```
+runtime_manager_->setExternalPath("tmp", MNN::Interpreter::EXTERNAL_WEIGHT_DIR);
+runtime_manager_->setExternalPath("tmp", MNN::Interpreter::EXTERNAL_FEATUREMAP_DIR);
+```
+
+- MNN::Interpreter::EXTERNAL_WEIGHT_DIR : 权重重排后的内存转换为文件存储
+- MNN::Interpreter::EXTERNAL_FEATUREMAP_DIR : 中间内存转换为文件存储
+
 ### 创建Module
 `Module`可以通过指定模型,输入输出的名称,配置文件创建
 ```cpp

The diff for this file has been suppressed because it is too large
+ 2 - 2
docs/start/overall.md


+ 0 - 1
docs/tools/convert.md

@@ -1,5 +1,4 @@
 # 模型转换工具
-[从源码编译](../compile/tools.html#id2)
 ## 参数说明
 ```bash
 Usage:

+ 5 - 1
docs/tools/quant.md

@@ -1,7 +1,7 @@
 # 单输入模型离线量化工具
 `./quantized.out origin.mnn quan.mnn imageInputConfig.json`
 
-通用(任意输入个数、维度、类型)模型离线量化请看[说明](https://mnn-docs.readthedocs.io/en/latest/tools/compress.html#id10)
+MNN quantized.out工具已支持通用(任意输入个数、维度、类型)模型离线量化, 但这里的多输入模型仅仅支持非图片输入类模型。
 
 MNN现已推出基于TensorFlow/Pytorch的模型压缩工具mnncompress,请查看[文档](https://mnn-docs.readthedocs.io/en/latest/tools/compress.html)选择使用
 
@@ -38,6 +38,10 @@ MNN现已推出基于TensorFlow/Pytorch的模型压缩工具mnncompress,请查
 | MAX_ABS | 使用权值的绝对值的最大值进行对称量化 |
 | ADMM | 使用ADMM方法进行权值量化 |
 
+## 多输入模型的参数设置的特别说明(MNN现阶段仅支持输入数据类型是非图片的多输入模型)
+| input_type | `str` | 输入数据的类型,"sequence" |
+| path | `str` | 存放校正特征量化系数的输入数据目录 |,例如该目录下包含2个输入数据集input_0和input_1,子目录input_0和input_1中包含模型的输入数据和一个input.json文件。input_0和input_1分别是两个输入输出信息文件夹,可使用 testMNNFromOnnx.py 等脚本生成,参考模型转换的正确性校验部分。
+
 ## 量化模型的使用
 和浮点模型同样使用方法,输入输出仍然为浮点类型
 ## 参考资料

+ 23 - 11
docs/tools/test.md

@@ -1,5 +1,5 @@
 # 测试工具
-[从源码编译](../compile/tools.html#id4)使用cmake编译时,build目录下的产物也包含测试使用的工具集,下面逐项说明。
+使用cmake编译时,默认打开 MNN_BUILD_TOOLS 编译宏,对应build目录下的产物也包含测试使用的工具集,下面逐项说明。
 
 ## GetMNNInfo
 ### 功能
@@ -95,6 +95,7 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms
 - 128 : 使用文件夹下面的 input.mnn 和 output.mnn 做为输入和对比输出,对于数据量较大的情况宜用此方案
 - 512 : 开启使用Winograd算法计算卷积时的内存优化,开启后模型的运行时内存会降低,但可能导致性能损失。
 - 1024: 使用动态量化推理时,对输入数据分batch量化以提高模型的推理精度
+- 2048: 使用mmap方式,使用文件存储中间内存。存储文件的目录为当前目录/tmp,需要先建tmp文件夹
 
 
 ### 示例
@@ -262,19 +263,10 @@ stopOp.c_str()=s  in main, 278
 Correct ! Run second pass
 Correct !
 ```
-### 在Android中使用
-先编译相关的库和可执行文件,然后push到Android手机上,用adb执行命令,参考`project/android/testCommon.sh`
-```bash
-cd project/android
-mkdir build_64
-cd build_64 && ../build_64.sh
-../updateTest.sh
-../testCommon.sh ./backendTest.out temp.mnn 3 0.15 1
-```
 
 ## getPerformance
 ### 功能
-获取当前设备的CPU性能,打印出每个CPU核心的频率;在Android设备上还会打印该设备CPU的浮点计算能力(GFLOPS)
+获取当前设备的CPU性能和内存访问性能,打印出每个CPU核心的频率;在Android设备上还会打印该设备CPU的浮点计算能力(GFLOPS)
 
 *各核心频率仅在Linux/Android环境中有效,计算能力仅在Android中有效*
 ### 参数
@@ -475,6 +467,7 @@ Matrix:
 ### 示例
 ```bash
 $ ./fuseTest user.spirv user.json
+```
 
 ## GpuInterTest.out
 ### 功能
@@ -488,3 +481,22 @@ GPU 内存输入测试用例
 - `forwardType:int` 执行推理的计算设备,有效值为:0(CPU)、1(Metal)、2(CUDA)、3(OpenCL)、6(OpenGL),7(Vulkan) ,9 (TensorRT),可选,默认为`0`
 - `numberThread:int` GPU的线程数,可选,默认为`1`
 - `precision_memory:int` 测试精度与内存模式,precision_memory % 16 为精度,有效输入为:0(Normal), 1(High), 2(Low), 3(Low_BF16),可选,默认为`2` ; precision_memory / 16 为内存设置,默认为 0 (memory_normal) 。例如测试 memory 为 2(low) ,precision 为 1 (high) 时,设置 precision_memory = 9 (2 * 4 + 1)
+
+
+## 在Android中使用测试工具
+- project/android/updateTest.sh 可以把编译好的库和可执行文件 push 到Android手机的/data/local/tmp/MNN 目录
+- project/android/testCommon.sh 可以在 /data/local/tmp/MNN 目录下执行可执行程序
+
+其他的资源文件需要自行使用 adb push ,将其放到手机的 /data/local/tmp/MNN 目录下,比如 adb push temp.mnn /data/local/tmp/MNN/temp.mnn
+
+如下例子是在Android设备上使用 backendTest.out ,其中 temp.mnn 路径为 /data/local/tmp/MNN/temp.mnn
+
+```bash
+cd project/android
+mkdir build_64
+cd build_64 && ../build_64.sh
+../updateTest.sh
+../testCommon.sh ./backendTest.out temp.mnn 3 0.15 1
+```
+
+

+ 11 - 12
docs/transformers/diffusion.md

@@ -17,8 +17,8 @@ https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1/tree/mai
 ## 模型转换
 ### 将Huggingface的Stable Diffusion模型 转为onnx模型
 ```sh
-cd mnn_path/transformers/diffusion/
-python export/onnx_export.py \
+cd mnn_path/transformers/diffusion/export
+python onnx_export.py \
     --model_path hf_sd_load_path \
     --output_path onnx_save_path
 ```
@@ -30,20 +30,19 @@ conda activate ldm
 在conda环境中执行模型转换脚本
 
 ### 将onnx模型转为mnn模型
-新建diffusion mnn模型文件夹,将转好的mnn文件放在该文件夹下。
-1. 实现encoder从onnx模型 -> mnn模型
-```
-./MNNConvert -f ONNX --modelFile onnx_save_path/text_encoder/model.onnx --MNNModel mnn_save_path/text_encoder.mnn --weightQuantBits 8 --bizCode biz
-```
-2. 实现denoiser unet从onnx模型 -> mnn模型
+新建diffusion mnn模型文件夹 mnn_save_path ,将转好的mnn文件放在该文件夹下。
+
+执行脚本
 ```
-./MNNConvert -f ONNX --modelFile onnx_save_path/unet/model.onnx --MNNModel mnn_save_path/unet.mnn --transformerFuse --weightQuantBits 8 --bizCode biz
-注意:对于非OpenCL后端推理,需要去掉--transformerFuse。
+python3 convert_mnn.py ../onnx ~/alicnn/AliNNPrivate/build/diffusion "--weightQuantBits=8"
 ```
-3. 实现decoder从onnx模型 -> mnn模型
+
+若希望在OpenCL后端上进一步加速,可加上--transformerFuse:
 ```
-./MNNConvert -f ONNX --modelFile onnx_save_path/vae_decoder/model.onnx --keepInputFormat --MNNModel mnn_save_path/vae_decoder.mnn --weightQuantBits 8 --bizCode biz
+# 适用OpenCL 后端推理
+python3 convert_mnn.py onnx_path mnn_save_path "--weightQuantBits=8 --transformerFuse"
 ```
+
 ## 编译Diffusion Demo
 ### Linux/MAC/Windows上
 ```

+ 78 - 62
docs/transformers/llm.md

@@ -6,68 +6,59 @@
 
 ## 模型导出
 
-`llm_export`是一个llm模型导出工具,能够将llm模型导出为onnx和mnn模型。
+`llmexport`是一个llm模型导出工具,能够将llm模型导出为onnx和mnn模型。
 
 ### 用法
 1. 将需要导出的LLM项目clone到本地,如:Qwen2-0.5B-Instruct
 ```sh
 git clone https://www.modelscope.cn/qwen/Qwen2-0.5B-Instruct.git
 ```
-3. 执行`llm_export.py`导出模型
+3. 执行`llmexport.py`导出模型
 ```sh
 cd ./transformers/llm/export
 # 导出模型,tokenizer和embedding,并导出对应的mnn模型
-python llm_export.py \
-        --type Qwen2-0_5B-Instruct \
+python llmexport.py \
         --path /path/to/Qwen2-0.5B-Instruct \
-        --export \
-        --export_token \
-        --export_embed --embed_bin \
-        --export_mnn
+        --export mnn
 ```
 4. 导出产物
 导出产物为:
-1. `embeddings_bf16.bin`: 模型的embedding权重二进制文件,推理时使用;
-2. `llm_config.json`: 模型的配置信息,推理时使用;
-3. `llm.onnx`: 模型的onnx文件,推理时不使用;
-4. `tokenizer.txt`: 模型的tokenzier文件,推理时使用;
-5. `llm.mnn`: 模型的mnn文件,推理时使用;
-6. `llm.mnn.weight`: 模型的mnn权重,推理时使用;
+1. `config.json`: 模型运行时的配置,可手动修改;
+2. `embeddings_bf16.bin`: 模型的embedding权重二进制文件,推理时使用;
+3. `llm.mnn`: 模型的mnn文件,推理时使用;
+4. `llm.mnn.json`: mnn模型对应的json文件,apply_lora或者gptq量化权重时使用;
+5. `llm.mnn.weight`: 模型的mnn权重,推理时使用;
+6. `llm.onnx`: 模型的onnx文件,不包含权重,推理时不使用;
+7. `llm_config.json`: 模型的配置信息,推理时使用;
+8. `tokenizer.txt`: 模型的tokenzier文件,推理时使用;
 目录结构如下所示:
 ```
 .
-├── onnx
-|    ├── embeddings_bf16.bin
-|    ├── llm_config.json
-|    ├── llm.onnx
-|    └── tokenizer.txt
-└── mnn
+└── model
+     ├── config.json
+     ├── embeddings_bf16.bin
      ├── llm.mnn
-     └── llm.mnn.weight
+     ├── llm.mnn.json
+     ├── llm.mnn.weight
+     ├── llm.onnx
+     ├── llm_config.json
+     └── tokenizer.txt
 ```
 
 ### 功能
-- 支持将模型完整导出为一个onnx模型,使用`--export`
-- 支持将模型分段导出为多个模型,使用`--export_split`
-- 支持导出模型的词表到一个文本文件,每行代表一个token;其中token使用base64编码;使用`--export_verbose`
-- 支持导出模型的Embedding层为一个onnx模型,使用`--export_embed`,同时支持bf16格式,使用`--embed_bf16`
-- 支持分层导出模型的block,使用`--export_blocks`导出全部层;使用`--export_block $id`导出指定层
-- 支持导出模型的lm_head层为一个onnx模型,使用`--export_lm`
-- 支持导出多模态模型的visual模型为一个onnx模型,使用`--export_visual`
+- 支持将模型为onnx或mnn模型,使用`--export onnx`或`--export mnn`
 - 支持对模型进行对话测试,使用`--test $query`会返回llm的回复内容
-- 支持在导出onnx模型后使用onnxruntime对结果一致性进行校验,使用`--export_test`
-- 支持将tokenizer导出为文本文件,使用`--export_token`
-- 支持将导出的onnx模型转换为mnn模型,默认转换为非对称4bit量化,使用`--export_mnn`
-- 指定导出路径使用`--onnx_path`和`--mnn_path`
 - 默认会使用onnx-slim对onnx模型进行优化,跳过该步骤使用`--skip_slim`
 - 支持合并lora权重后导出,指定lora权重的目录使用`--lora_path`
+- 制定量化bit数使用`--quant_bit`;量化的block大小使用`--quant_block`
+- 使用`--lm_quant_bit`来制定lm_head层权重的量化bit数,不指定则使用`--quant_bit`的量化bit数
+- 支持使用自己编译的`MNNConvert`,使用`--mnnconvert`
 
 ### 参数
 ```
-usage: llm_export.py [-h] --path PATH
-                     [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Qwen2-1_5B-Instruct,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora}]
-                     [--lora_path LORA_PATH] [--onnx_path ONNX_PATH] [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] [--export_split] [--export_token]
-                     [--export_embed] [--export_visual] [--export_lm] [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bin] [--embed_bf16] [--skip_slim]
+usage: llmexport.py [-h] --path PATH [--type TYPE] [--lora_path LORA_PATH] [--dst_path DST_PATH] [--test TEST] [--export EXPORT]
+                    [--skip_slim] [--quant_bit QUANT_BIT] [--quant_block QUANT_BLOCK] [--lm_quant_bit LM_QUANT_BIT]
+                    [--mnnconvert MNNCONVERT]
 
 llm_exporter
 
@@ -77,33 +68,22 @@ options:
                         Can be either:
                         	- A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO]
                         	- A path to a *directory* clone from repo like `../chatglm-6b`.
-  --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Qwen2-1_5B-Instruct,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora}
-                        type(`str`, *optional*):
+  --type TYPE           type(`str`, *optional*):
                         	The pretrain llm model type.
   --lora_path LORA_PATH
                         lora path, defaut is `None` mean not apply lora.
-  --onnx_path ONNX_PATH
-                        export onnx model path, defaut is `./onnx`.
-  --mnn_path MNN_PATH   export mnn model path, defaut is `./mnn`.
-  --export_mnn          Whether or not to export mnn model after onnx.
-  --export_verbose      Whether or not to export onnx with verbose.
-  --export_test         Whether or not to export onnx with test using onnxruntime.
+  --dst_path DST_PATH   export onnx/mnn model to path, defaut is `./model`.
   --test TEST           test model inference with query `TEST`.
-  --export              export model to an `onnx` model.
-  --export_split        export model split to some `onnx` models:
-                        	- embedding model.
-                        	- block models.
-                        	- lm_head model.
-  --export_token        export llm tokenizer to a txt file.
-  --export_embed        export llm embedding to an `onnx` model.
-  --export_visual       export llm visual model to an `onnx` model.
-  --export_lm           export llm lm_head to an `onnx` model.
-  --export_block EXPORT_BLOCK
-                        export llm block [id] to an `onnx` model.
-  --export_blocks       export llm all blocks to `onnx` models.
-  --embed_bin           export embedding weight as bin file with dtype `bfloat16`
-  --embed_bf16          using `bfloat16` replace `float32` in embedding.
+  --export EXPORT       export model to an onnx/mnn model.
   --skip_slim           Whether or not to skip onnx-slim.
+  --quant_bit QUANT_BIT
+                        mnn quant bit, 4 or 8, default is 4.
+  --quant_block QUANT_BLOCK
+                        mnn quant block, default is 0 mean channle-wise.
+  --lm_quant_bit LM_QUANT_BIT
+                        mnn lm_head quant bit, 4 or 8, default is `quant_bit`.
+  --mnnconvert MNNCONVERT
+                        local mnnconvert path, if invalid, using pymnn.
 ```
 
 ## 模型推理
@@ -111,6 +91,37 @@ options:
 ### 编译
 
 [从源码编译](../compile/other.html#id4)
+在原有编译过程中增加必需编译宏即可: -DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true 
+
+- mac / linux / windows
+
+以 mac / linux 为例 :
+```
+make build
+cd build
+cmake ../ -DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true
+make -j16
+```
+
+x86架构额外加 MNN_AVX512 的宏:
+```
+make build
+cd build
+cmake ../ -DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true -DMNN_AVX512=true
+make -j16
+```
+
+- Android:额外增加 MNN_ARM82 的宏
+```
+cd project/android
+mkdir build_64
+../build_64.sh "-DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true -DMNN_ARM82=true"
+```
+
+- iOS: 参考 transformers/llm/engine/ios/README.md
+```
+sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true -DMNN_BUILD_LLM=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true"
+```
 
 ### 使用
 #### 运行时配置
@@ -144,11 +155,16 @@ options:
 - 推理配置
   - max_new_tokens: 生成时最大token数,默认为`512`
   - reuse_kv: 多轮对话时是否复用之前对话的`kv cache`,默认为`false`
-  - quant_kv: 存储`kv cache`时是否量化,可选为:`0, 1, 2, 3`,默认为`0`,含义如下:
+  - quant_qkv: CPU attention 算子中`query, key, value`是否量化,可选为:`0, 1, 2, 3, 4`,默认为`0`,含义如下:
     - 0: key和value都不量化
     - 1: 使用非对称8bit量化存储key
-    - 2: 使用fp8格式寸处value
-    - 3: 使用非对称8bit量化存储key,使用fp8格式寸处value
+    - 2: 使用fp8格式量化存储value
+    - 3: 使用非对称8bit量化存储key,使用fp8格式量化存储value
+    - 4: 量化kv的同时使用非对称8bit量化query,并使用int8矩阵乘计算Q*K
+  - use_mmap: 是否使用mmap方式,在内存不足时将权重写入磁盘,避免溢出,默认为false,手机上建议设成true
+  - kvcache_mmap: 是否使用mmap方式,在内存不足时将在KV Cache 写入磁盘,避免溢出,默认为false
+  - tmp_path: 启用 mmap 相关功能时,写入磁盘的缓存目录
+    - iOS 上可用如下语句创建临时目录并设置:`NSString *tempDirectory = NSTemporaryDirectory();llm->set_config("{\"tmp_path\":\"" + std::string([tempDirectory UTF8String]) + "\"}")`
 - 硬件配置
   - backend_type: 推理使用硬件后端类型,默认为:`"cpu"`
   - thread_num: CPU推理使用硬件线程数,默认为:`4`; OpenCL推理时使用`68`
@@ -266,4 +282,4 @@ options:
       thread1.join();
       thread2.join();
   }
-  ```
+  ```
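
As a hedged illustration of passing the runtime options documented above to the C++ engine (the header, namespace and config path used here are assumptions based on the transformers/llm/engine layout):

```cpp
#include "llm/llm.hpp"   // assumption: engine header from transformers/llm/engine/include
#include <memory>

using MNN::Transformer::Llm; // assumption: engine namespace

int main() {
    // config.json is the file generated by llmexport.py next to llm.mnn / llm.mnn.weight.
    std::unique_ptr<Llm> llm(Llm::createLLM("model/config.json"));
    // Runtime options from this doc: quantize the kv cache, mmap weights, temp dir for mmap files.
    llm->set_config("{\"quant_qkv\":3,\"use_mmap\":true,\"tmp_path\":\"tmp\"}");
    llm->load();
    llm->response("Hello");
    return 0;
}
```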

+ 2 - 3
express/Executor.cpp

@@ -154,9 +154,8 @@ std::shared_ptr<Executor> Executor::getGlobalExecutor() {
         RuntimeHint hint;
         hint.memoryAllocatorType = 0;// Defer
         bn->setRuntimeHint(hint);
-        static std::shared_ptr<Executor> executorStatic;
-        executorStatic.reset(new Executor(bn, MNN_FORWARD_CPU, 1));
-        gExecutor = &executorStatic;
+        gExecutor = new std::shared_ptr<Executor>;
+        gExecutor->reset(new Executor(bn, MNN_FORWARD_CPU, 1));
     });
     return *gExecutor;
 }

+ 7 - 1
express/module/Module.cpp

@@ -330,11 +330,17 @@ Module* Module::load(const std::vector<std::string>& inputs, const std::vector<s
     if (nullptr == rtMgr.get()) {
         rtMgr.reset(_createDefaultRuntimeManager(config));
     }
+    bool needReset = false;
     if (rtMgr->getInside()->mExternalFile.empty()) {
         // Set Default externalFile
         rtMgr->setExternalFile(std::string(fileName) + ".weight");
+        needReset = true;
     }
-    return loadInternal(inputs, outputs, buffer.get(), buffer.size(), rtMgr, config);
+    auto res = loadInternal(inputs, outputs, buffer.get(), buffer.size(), rtMgr, config);
+    if (needReset) {
+        rtMgr->setExternalFile("");
+    }
+    return res;
 }
 
 Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> _rtMgr, const Module::Config* config) {
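
For context, a hedged sketch of what this default means for callers (model and tensor names are placeholders): when no external file has been set on the RuntimeManager, `Module::load` temporarily points it at `fileName + ".weight"`, matching the `llm.mnn` / `llm.mnn.weight` pair produced by the exporter, and clears the setting again before returning.

```cpp
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/expr/Module.hpp>

using namespace MNN;
using namespace MNN::Express;

int main() {
    BackendConfig backendConfig;
    auto executor = Executor::newExecutor(MNN_FORWARD_CPU, backendConfig, 4);
    ExecutorScope scope(executor);
    ScheduleConfig sConfig;
    std::shared_ptr<Executor::RuntimeManager> rtmgr(
        Executor::RuntimeManager::createRuntimeManager(sConfig));
    Module::Config mConfig;
    // No rtmgr->setExternalFile(...) call: during load, "llm.mnn.weight" is used as the
    // external weight file, and the setting is reset afterwards so rtmgr can be reused.
    std::shared_ptr<Module> net(
        Module::load({"input_ids"}, {"logits"}, "llm.mnn", rtmgr, &mConfig));
    return net ? 0 : 1;
}
```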

+ 7 - 15
express/module/StaticModule.cpp

@@ -33,7 +33,7 @@ static const StaticModule* getStaticModule(const Module* m) {
 }
 
 static std::vector<std::shared_ptr<BufferStorage>> preRearrangeWeights( // NOLINT
-                                                                       Schedule::ScheduleInfo& scheduleInfo, Backend* backend, Backend* backupBackend, const Module* base = nullptr) {
+                                                                       Schedule::ScheduleInfo& scheduleInfo, Backend* firstbackend, Backend* backupBackend, const Module* base = nullptr) {
     std::map<const std::string, std::shared_ptr<Execution>> base_executions;
     if (base != nullptr) {
         // has base module
@@ -59,6 +59,10 @@ static std::vector<std::shared_ptr<BufferStorage>> preRearrangeWeights( // NOLIN
         auto op    = pipelineInfo[i].op;
         std::unique_ptr<OpT> op_table(op->UnPack());
         std::shared_ptr<Execution> exe;
+        Backend* backend = firstbackend;
+        if (info.type == Schedule::CONSTANT) {
+            backend = backupBackend;
+        }
         switch (op->type()) {
             case MNN::OpType_DepthwiseConvInt8:
             case MNN::OpType_ConvInt8:
@@ -304,20 +308,8 @@ StaticModule::StaticModule(std::vector<int> inputs,
     std::map<const Op*, std::pair<std::shared_ptr<Execution>, DataType>> exeCache;
     MNN_ASSERT(1 == scheduleInfo.pipelineInfo.size());
     auto& bnCache = scheduleInfo.pipelineInfo[0].first;
-    bnCache.cache.first.reset(rt.first[bnCache.info.type]->onCreate(bnCache.info.user));
-    if (bnCache.cache.first->type() == MNN_FORWARD_CPU) {
-        bnCache.cache.second = bnCache.cache.first;
-    } else {
-        // Use Multi-thread if user has set numberthread > 1
-        BackendConfig defaultConfig;
-        defaultConfig.flags = 4;
-        auto cpurt = rt.first.find(MNN_FORWARD_CPU);
-        if (cpurt != rt.first.end()) {
-            bnCache.cache.second.reset(cpurt->second->onCreate(&defaultConfig));
-        } else {
-            bnCache.cache.second.reset(rt.second->onCreate(&defaultConfig));
-        }
-    }
+    // Create Backend for prearrange
+    Session::createPipelineBackend(scheduleInfo.pipelineInfo[0], rt);
     if (config.rearrange) {
         mResource->mBuffer = preRearrangeWeights(scheduleInfo, bnCache.cache.first.get(), bnCache.cache.second.get(), config.base);
     } else {

+ 12 - 5
include/MNN/Interpreter.hpp

@@ -224,11 +224,12 @@ public:
         // Default is 50
         CPU_LITTLECORE_DECREASE_RATE = 6,
 
-        // 0: Do not quantize kvcache, just store float
-        // 1: Only quantize key cache, use int8 asymmetric quantization 
-        // 2: Only quantize value cache, use fp8 quantization
-        // 3: quantize both key and value cache as described above
-        KVCACHE_QUANT_OPTIONS = 7,
+        // 0: Do not quantize
+        // 1: Only quantize key, use int8 asymmetric quantization 
+        // 2: Only quantize value, use fp8 quantization
+        // 3: quantize both key and value
+        // 4: quantize query, key and value, and use gemm int8 kernel to compute K*V
+        QKV_QUANT_OPTIONS = 7,
 
         // size limit of kvcache in memory (for a single layer)
         // if the size of kvcache exceeds the limit, it will be moved to disk
@@ -238,6 +239,12 @@ public:
     enum ExternalPathType {
         // Path of the kvcache directory
         EXTERNAL_PATH_KVCACHE_DIR = 0,
+        
+        // Mid Buffer Cache File
+        EXTERNAL_FEATUREMAP_DIR = 1,
+
+        // Weight Buffer Cache File
+        EXTERNAL_WEIGHT_DIR = 2,
 
         // Other types ...
     };
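
A minimal sketch of using the renamed hint through the standard Interpreter API (the model path is a placeholder):

```cpp
#include <MNN/Interpreter.hpp>
#include <memory>

int main() {
    // "model.mnn" is a placeholder path.
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn"));
    // 3 = quantize key (int8 asymmetric) and value (fp8); 4 would additionally quantize query.
    net->setSessionHint(MNN::Interpreter::QKV_QUANT_OPTIONS, 3);
    MNN::ScheduleConfig config;
    config.type = MNN_FORWARD_CPU;
    auto session = net->createSession(config);
    // ... resize and run the session as usual ...
    net->releaseSession(session);
    return 0;
}
```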

+ 1 - 1
include/MNN/MNNDefine.h

@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 2
 #define MNN_VERSION_MINOR 9
-#define MNN_VERSION_PATCH 4
+#define MNN_VERSION_PATCH 5
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */

+ 2 - 21
package_scripts/ios/buildiOS.sh

@@ -12,31 +12,12 @@ cd Static
 rm -rf ios_64
 mkdir ios_64
 cd ios_64
-cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="arm64" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_ARM82=true -DMNN_BUILD_SHARED_LIBS=false $1
+cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="arm64" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_ARM82=true -DMNN_BUILD_SHARED_LIBS=false -DMNN_USE_THREAD_POOL=OFF $1
 echo "Building AArch64"
 make MNN -j16
 echo "End Building AArch64"
 cd ../
 
-rm -rf ios_32
-mkdir ios_32
-cd ios_32
-cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="armv7;armv7s" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_BUILD_SHARED_LIBS=false $1
-echo "Building AArch32"
-make MNN -j16
-echo "End Building AArch32"
-cd ../
-
-find ios_32 -name "MNN*framework"
-find ios_64 -name "MNN*framework"
-
-mv ios_32/MNN.framework/MNN ios_32/MNN.framework/MNN_32
+mv ios_64/MNN.framework MNN.framework
 
-echo "Creating Fat Binary"
-lipo -create ios_32/MNN.framework/MNN_32 ios_64/MNN.framework/MNN -output ios_32/MNN.framework/MNN
-rm ios_32/MNN.framework/MNN_32
-echo "Patching Framework Headers"
-rm -rf ./MNN.framework
-cp -R ios_32/MNN.framework ./MNN.framework
-rm -rf ios_32
 rm -rf ios_64

+ 42 - 0
package_scripts/ios/buildiOS_with_armv7.sh

@@ -0,0 +1,42 @@
+#!/bin/sh
+echo "Change directory to MNN_SOURCE_ROOT/project/ios before running this script"
+echo "Current PWD: ${PWD}"
+
+rm -rf MNN-iOS-CPU-GPU
+mkdir MNN-iOS-CPU-GPU
+cd MNN-iOS-CPU-GPU
+# Static Begin
+mkdir Static 
+cd Static
+
+rm -rf ios_64
+mkdir ios_64
+cd ios_64
+cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="arm64" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_ARM82=true -DMNN_BUILD_SHARED_LIBS=false -DMNN_USE_THREAD_POOL=OFF $1
+echo "Building AArch64"
+make MNN -j16
+echo "End Building AArch64"
+cd ../
+
+rm -rf ios_32
+mkdir ios_32
+cd ios_32
+cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="armv7;armv7s" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_BUILD_SHARED_LIBS=false -DMNN_USE_THREAD_POOL=OFF $1
+echo "Building AArch32"
+make MNN -j16
+echo "End Building AArch32"
+cd ../
+
+find ios_32 -name "MNN*framework"
+find ios_64 -name "MNN*framework"
+
+mv ios_32/MNN.framework/MNN ios_32/MNN.framework/MNN_32
+
+echo "Creating Fat Binary"
+lipo -create ios_32/MNN.framework/MNN_32 ios_64/MNN.framework/MNN -output ios_32/MNN.framework/MNN
+rm ios_32/MNN.framework/MNN_32
+echo "Patching Framework Headers"
+rm -rf ./MNN.framework
+cp -R ios_32/MNN.framework ./MNN.framework
+rm -rf ios_32
+rm -rf ios_64

+ 4 - 4
package_scripts/mac/buildFrameWork.sh

@@ -18,7 +18,7 @@ cd Static
 # ARM
 mkdir mac_a64
 cd mac_a64
-cmake ../../../ -DMNN_USE_SSE=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_METAL=ON -DARCHS="arm64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DMNN_ARM82=ON -DCMAKE_OSX_ARCHITECTURES=arm64 -DMNN_BUILD_SHARED_LIBS=OFF $1
+cmake ../../../ -DMNN_USE_SSE=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON -DARCHS="arm64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DMNN_ARM82=ON -DCMAKE_OSX_ARCHITECTURES=arm64 -DMNN_BUILD_SHARED_LIBS=OFF $1
 echo "Building ARM64"
 make MNN -j16
 echo "End Building ARM64"
@@ -27,7 +27,7 @@ cd ../
 # X86
 mkdir mac_x64
 cd mac_x64
-cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_METAL=ON -DARCHS="x86_64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64 -DMNN_BUILD_SHARED_LIBS=OFF $1
+cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON -DARCHS="x86_64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64 -DMNN_BUILD_SHARED_LIBS=OFF $1
 echo "Building x86"
 make MNN -j16
 echo "End Building x86"
@@ -52,7 +52,7 @@ cd Dynamic
 # ARM
 mkdir mac_a64
 cd mac_a64
-cmake ../../../ -DMNN_USE_SSE=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_METAL=ON -DARCHS="arm64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DMNN_ARM82=ON -DCMAKE_OSX_ARCHITECTURES=arm64 $1
+cmake ../../../ -DMNN_USE_SSE=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON -DARCHS="arm64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DMNN_ARM82=ON -DCMAKE_OSX_ARCHITECTURES=arm64 $1
 echo "Building ARM64"
 make MNN -j16
 echo "End Building ARM64"
@@ -61,7 +61,7 @@ cd ../
 # X86
 mkdir mac_x64
 cd mac_x64
-cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_METAL=ON -DARCHS="x86_64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64 $1
+cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON -DARCHS="x86_64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64 $1
 echo "Building x86"
 make MNN -j16
 echo "End Building x86"

+ 0 - 1
project/android/build_32.sh

@@ -4,7 +4,6 @@ cmake ../../../ \
 -DCMAKE_BUILD_TYPE=Release \
 -DANDROID_ABI="armeabi-v7a" \
 -DANDROID_STL=c++_static \
--DCMAKE_BUILD_TYPE=Release \
 -DANDROID_NATIVE_API_LEVEL=android-14  \
 -DANDROID_TOOLCHAIN=clang \
 -DMNN_USE_LOGCAT=false \

+ 86 - 2
project/ios/MNN.xcodeproj/project.pbxproj

@@ -771,6 +771,25 @@
 		C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C4F906B127688C3A0026B847 /* NMSModule.hpp */; };
 		C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4F906B227688C3A0026B847 /* NMSModule.cpp */; };
 		C4FB6CB22769DF0800963B07 /* GeometryCumSum.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */; };
+		CE072A132C91AEE700F190FD /* MNNBGRToBGR555.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A032C91AEE700F190FD /* MNNBGRToBGR555.S */; };
+		CE072A142C91AEE700F190FD /* MNNBGRAToGRAY.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A042C91AEE700F190FD /* MNNBGRAToGRAY.S */; };
+		CE072A152C91AEE700F190FD /* MNNRGBAToGRAYFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A052C91AEE700F190FD /* MNNRGBAToGRAYFast.S */; };
+		CE072A162C91AEE700F190FD /* MNNBGRAToBGR.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A062C91AEE700F190FD /* MNNBGRAToBGR.S */; };
+		CE072A172C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A072C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S */; };
+		CE072A182C91AEE700F190FD /* MNNGRAYToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A082C91AEE700F190FD /* MNNGRAYToC4Fast.S */; };
+		CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A092C91AEE700F190FD /* MNNBGRToGRAY.S */; };
+		CE072A1A2C91AEE700F190FD /* MNNRGBToGRAYFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0A2C91AEE700F190FD /* MNNRGBToGRAYFast.S */; };
+		CE072A1B2C91AEE700F190FD /* MNNBGRToBGR565.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0B2C91AEE700F190FD /* MNNBGRToBGR565.S */; };
+		CE072A1C2C91AEE700F190FD /* MNNRGBAToBGRFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0C2C91AEE700F190FD /* MNNRGBAToBGRFast.S */; };
+		CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S */; };
+		CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0E2C91AEE700F190FD /* MNNRGBToBGR555.S */; };
+		CE072A1F2C91AEE700F190FD /* MNNRGBToBGR.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0F2C91AEE700F190FD /* MNNRGBToBGR.S */; };
+		CE072A202C91AEE700F190FD /* MNNGRAYToC3Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A102C91AEE700F190FD /* MNNGRAYToC3Fast.S */; };
+		CE072A212C91AEE700F190FD /* MNNRGBToBGR565.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A112C91AEE700F190FD /* MNNRGBToBGR565.S */; };
+		CE072A222C91AEE700F190FD /* MNNPackC2.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A122C91AEE700F190FD /* MNNPackC2.S */; };
+		CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */; };
+		CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */; };
+		CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; };
 		CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; };
 		CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; };
 		CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
@@ -805,6 +824,8 @@
 		CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */; };
 		CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */; };
 		CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */; };
+		CEEDB5542C7475A100FED0DC /* MNNFileUtils.h in Headers */ = {isa = PBXBuildFile; fileRef = CEEDB5522C7475A100FED0DC /* MNNFileUtils.h */; };
+		CEEDB5552C7475A100FED0DC /* MNNFileUtils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEEDB5532C7475A100FED0DC /* MNNFileUtils.cpp */; };
 		EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; };
 		EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; };
 		EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */; };
@@ -1607,6 +1628,25 @@
 		C4F906B127688C3A0026B847 /* NMSModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = NMSModule.hpp; sourceTree = "<group>"; };
 		C4F906B227688C3A0026B847 /* NMSModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NMSModule.cpp; sourceTree = "<group>"; };
 		C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryCumSum.cpp; sourceTree = "<group>"; };
+		CE072A032C91AEE700F190FD /* MNNBGRToBGR555.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNBGRToBGR555.S; path = arm/arm64/MNNBGRToBGR555.S; sourceTree = "<group>"; };
+		CE072A042C91AEE700F190FD /* MNNBGRAToGRAY.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNBGRAToGRAY.S; path = arm/arm64/MNNBGRAToGRAY.S; sourceTree = "<group>"; };
+		CE072A052C91AEE700F190FD /* MNNRGBAToGRAYFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBAToGRAYFast.S; path = arm/arm64/MNNRGBAToGRAYFast.S; sourceTree = "<group>"; };
+		CE072A062C91AEE700F190FD /* MNNBGRAToBGR.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNBGRAToBGR.S; path = arm/arm64/MNNBGRAToBGR.S; sourceTree = "<group>"; };
+		CE072A072C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNSamplerC3BilinearOpt.S; path = arm/arm64/MNNSamplerC3BilinearOpt.S; sourceTree = "<group>"; };
+		CE072A082C91AEE700F190FD /* MNNGRAYToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGRAYToC4Fast.S; path = arm/arm64/MNNGRAYToC4Fast.S; sourceTree = "<group>"; };
+		CE072A092C91AEE700F190FD /* MNNBGRToGRAY.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNBGRToGRAY.S; path = arm/arm64/MNNBGRToGRAY.S; sourceTree = "<group>"; };
+		CE072A0A2C91AEE700F190FD /* MNNRGBToGRAYFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBToGRAYFast.S; path = arm/arm64/MNNRGBToGRAYFast.S; sourceTree = "<group>"; };
+		CE072A0B2C91AEE700F190FD /* MNNBGRToBGR565.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNBGRToBGR565.S; path = arm/arm64/MNNBGRToBGR565.S; sourceTree = "<group>"; };
+		CE072A0C2C91AEE700F190FD /* MNNRGBAToBGRFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBAToBGRFast.S; path = arm/arm64/MNNRGBAToBGRFast.S; sourceTree = "<group>"; };
+		CE072A0D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBAToBGRAFast.S; path = arm/arm64/MNNRGBAToBGRAFast.S; sourceTree = "<group>"; };
+		CE072A0E2C91AEE700F190FD /* MNNRGBToBGR555.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBToBGR555.S; path = arm/arm64/MNNRGBToBGR555.S; sourceTree = "<group>"; };
+		CE072A0F2C91AEE700F190FD /* MNNRGBToBGR.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBToBGR.S; path = arm/arm64/MNNRGBToBGR.S; sourceTree = "<group>"; };
+		CE072A102C91AEE700F190FD /* MNNGRAYToC3Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGRAYToC3Fast.S; path = arm/arm64/MNNGRAYToC3Fast.S; sourceTree = "<group>"; };
+		CE072A112C91AEE700F190FD /* MNNRGBToBGR565.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBToBGR565.S; path = arm/arm64/MNNRGBToBGR565.S; sourceTree = "<group>"; };
+		CE072A122C91AEE700F190FD /* MNNPackC2.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackC2.S; path = arm/arm64/MNNPackC2.S; sourceTree = "<group>"; };
+		CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = "<group>"; };
+		CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = "<group>"; };
+		CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = "<group>"; };
 		CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
 		CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
 		CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
@@ -1643,6 +1683,8 @@
 		CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
 		CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
 		CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = "<group>"; };
+		CEEDB5522C7475A100FED0DC /* MNNFileUtils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNFileUtils.h; sourceTree = "<group>"; };
+		CEEDB5532C7475A100FED0DC /* MNNFileUtils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MNNFileUtils.cpp; sourceTree = "<group>"; };
 		EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = "<group>"; };
 		EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = "<group>"; };
 		EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OpRegister.cpp; path = ../arm82/Arm82OpRegister.cpp; sourceTree = "<group>"; };
@@ -1878,6 +1920,8 @@
 		488873AC215B639D0079B12E /* core */ = {
 			isa = PBXGroup;
 			children = (
+				CEEDB5532C7475A100FED0DC /* MNNFileUtils.cpp */,
+				CEEDB5522C7475A100FED0DC /* MNNFileUtils.h */,
 				48C84B9B250F722B00EE7666 /* Command.hpp */,
 				4819FB1524C138DF0050BD09 /* GeometryConvUtils.cpp */,
 				4819FB1324C138DF0050BD09 /* GeometryConvUtils.hpp */,
@@ -1921,6 +1965,25 @@
 		48887410215B639D0079B12E /* cpu */ = {
 			isa = PBXGroup;
 			children = (
+				CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */,
+				CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */,
+				CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */,
+				CE072A062C91AEE700F190FD /* MNNBGRAToBGR.S */,
+				CE072A042C91AEE700F190FD /* MNNBGRAToGRAY.S */,
+				CE072A032C91AEE700F190FD /* MNNBGRToBGR555.S */,
+				CE072A0B2C91AEE700F190FD /* MNNBGRToBGR565.S */,
+				CE072A092C91AEE700F190FD /* MNNBGRToGRAY.S */,
+				CE072A102C91AEE700F190FD /* MNNGRAYToC3Fast.S */,
+				CE072A082C91AEE700F190FD /* MNNGRAYToC4Fast.S */,
+				CE072A122C91AEE700F190FD /* MNNPackC2.S */,
+				CE072A0D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S */,
+				CE072A0C2C91AEE700F190FD /* MNNRGBAToBGRFast.S */,
+				CE072A052C91AEE700F190FD /* MNNRGBAToGRAYFast.S */,
+				CE072A0F2C91AEE700F190FD /* MNNRGBToBGR.S */,
+				CE072A0E2C91AEE700F190FD /* MNNRGBToBGR555.S */,
+				CE072A112C91AEE700F190FD /* MNNRGBToBGR565.S */,
+				CE072A0A2C91AEE700F190FD /* MNNRGBToGRAYFast.S */,
+				CE072A072C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S */,
 				CEE4566A2BC0E23D00F062C1 /* CPUExternalConst.cpp */,
 				95278CE62B9F0999009E9B29 /* CPUDynamicQuant.cpp */,
 				95278CE52B9F0999009E9B29 /* CPUDynamicQuant.hpp */,
@@ -2969,6 +3032,7 @@
 				489D7A982550FDC900AD896A /* MNNMetalContext.h in Headers */,
 				952298B82B4D4CC80043978B /* coreMLLayerNorm.hpp in Headers */,
 				92FF029323AA0B5A00AC97F6 /* CPURange.hpp in Headers */,
+				CEEDB5542C7475A100FED0DC /* MNNFileUtils.h in Headers */,
 				4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */,
 				4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
 				48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
@@ -3260,6 +3324,8 @@
 				950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */,
 				92FF04BD23AA0BFB00AC97F6 /* Execution.cpp in Sources */,
 				92FF030A23AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */,
+				CE072A212C91AEE700F190FD /* MNNRGBToBGR565.S in Sources */,
+				CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */,
 				92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */,
 				48FA474623AA127B00172C3B /* NeuralNetWorkOp.cpp in Sources */,
 				4D9A936E26255BDA00F9B43C /* CoreMLArgMax.cpp in Sources */,
@@ -3270,6 +3336,7 @@
 				48747D63245D9E33000B9709 /* GeometryPermute.cpp in Sources */,
 				92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */,
 				48BB6EF625220AA80056E195 /* MNNTranspose32Bit4x4.S in Sources */,
+				CE072A1C2C91AEE700F190FD /* MNNRGBAToBGRFast.S in Sources */,
 				CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */,
 				48BB6EF025220A930056E195 /* MNNTranspose32Bit4x4.S in Sources */,
 				92FF031223AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */,
@@ -3296,6 +3363,7 @@
 				4D9A935F26255BDA00F9B43C /* NeuralNetwork.pb-c.c in Sources */,
 				4D0C80E32862FC4100C7CAD6 /* CoreMLOPRegister.cpp in Sources */,
 				92FF02BE23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */,
+				CE072A1A2C91AEE700F190FD /* MNNRGBToGRAYFast.S in Sources */,
 				4A224A0B27D0C2D9000A9260 /* ConvolutionPackFreeWinograd.cpp in Sources */,
 				48608B52250632EC00CB1D71 /* GeometryComputerUtils.cpp in Sources */,
 				489D7A8A2550FDC900AD896A /* MetalConvolutionDepthwise.mm in Sources */,
@@ -3330,6 +3398,7 @@
 				92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */,
 				92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */,
 				92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
+				CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */,
 				EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */,
 				481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */,
 				92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
@@ -3353,6 +3422,7 @@
 				48747D6F245D9E33000B9709 /* GeometryConcat.cpp in Sources */,
 				4819FB3224C1396A0050BD09 /* GeometryReduce.cpp in Sources */,
 				950B28EF29F627F70002F454 /* MNNBinaryMaxInt8.S in Sources */,
+				CE072A132C91AEE700F190FD /* MNNBGRToBGR555.S in Sources */,
 				92FF02B023AA0B5A00AC97F6 /* CPUDequantize.cpp in Sources */,
 				92FF04C223AA0BFB00AC97F6 /* Pipeline.cpp in Sources */,
 				92FF04C423AA0BFB00AC97F6 /* Session.cpp in Sources */,
@@ -3395,6 +3465,7 @@
 				48958783268EBA7C00EA01A7 /* ShapeSegmentMean.cpp in Sources */,
 				48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */,
 				92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */,
+				CE072A1B2C91AEE700F190FD /* MNNBGRToBGR565.S in Sources */,
 				48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */,
 				48C84B9A250F720C00EE7666 /* CPULayerNorm.cpp in Sources */,
 				4DF87C4A2887D3560003E2D4 /* calib3d.cpp in Sources */,
@@ -3449,6 +3520,7 @@
 				92FF034223AA0B5A00AC97F6 /* CPUReduction.cpp in Sources */,
 				92FF02CF23AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */,
 				C4F906B0276886040026B847 /* GeometryTopK.cpp in Sources */,
+				CEEDB5552C7475A100FED0DC /* MNNFileUtils.cpp in Sources */,
 				48CA2F572681844C003A1796 /* MNNUnpackC8FP16.S in Sources */,
 				92FF030E23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */,
 				4837147225A599EC004DBDED /* Arm82Binary.cpp in Sources */,
@@ -3473,6 +3545,7 @@
 				4D9A936726255BDA00F9B43C /* CoreMLReduction.cpp in Sources */,
 				48F5881324DEA3F000C484A2 /* GeometryConv3D.cpp in Sources */,
 				4882C8BA241A22B800DAC168 /* OpCommonUtils.cpp in Sources */,
+				CE072A202C91AEE700F190FD /* MNNGRAYToC3Fast.S in Sources */,
 				92FF02B523AA0B5A00AC97F6 /* CPUTopKV2.cpp in Sources */,
 				92FF02BD23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */,
 				489D7A872550FDC900AD896A /* MetalOPRegister.mm in Sources */,
@@ -3536,17 +3609,21 @@
 				4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */,
 				11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */,
 				48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */,
+				CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */,
 				CECF8C7D299CAD9400D3875B /* md5.c in Sources */,
 				92FF041923AA0B7100AC97F6 /* ShapeQuantizedMaxPool.cpp in Sources */,
 				92FF038A23AA0B5A00AC97F6 /* CPURange.cpp in Sources */,
+				CE072A182C91AEE700F190FD /* MNNGRAYToC4Fast.S in Sources */,
 				CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */,
 				92FF03A123AA0B5A00AC97F6 /* Int8FunctionsOpt.cpp in Sources */,
+				CE072A222C91AEE700F190FD /* MNNPackC2.S in Sources */,
 				92FF026523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.cpp in Sources */,
 				92FF029423AA0B5A00AC97F6 /* CPUMatMul.cpp in Sources */,
 				48747D62245D9E33000B9709 /* GeometryOPRegister.cpp in Sources */,
 				4838EA8B2611C1310027232C /* ShapeGridSample.cpp in Sources */,
 				92FF03A323AA0B5A00AC97F6 /* ConvOpt.cpp in Sources */,
 				92FF02CD23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */,
+				CE072A172C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S in Sources */,
 				92FF029A23AA0B5A00AC97F6 /* CPUQuantizedMaxPool.cpp in Sources */,
 				48F5881124DEA3F000C484A2 /* GeometryPooling3D.cpp in Sources */,
 				92FF042423AA0B7100AC97F6 /* ShapeROIPooling.cpp in Sources */,
@@ -3569,11 +3646,13 @@
 				92FF02B123AA0B5A00AC97F6 /* CPUBackend.cpp in Sources */,
 				4D9A936226255BDA00F9B43C /* FeatureTypes.pb-c.c in Sources */,
 				486E1A9924F5078D00C16006 /* CPURandomUniform.cpp in Sources */,
+				CE072A1F2C91AEE700F190FD /* MNNRGBToBGR.S in Sources */,
 				92FF02C823AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */,
 				92FF045C23AA0B7100AC97F6 /* ShapeBroadcastTo.cpp in Sources */,
 				48747D49245D9D24000B9709 /* RuntimeFactory.cpp in Sources */,
 				92FF02AE23AA0B5A00AC97F6 /* CPUProposal.cpp in Sources */,
 				92FF042723AA0B7100AC97F6 /* ShapeMatMul.cpp in Sources */,
+				CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */,
 				92FF042823AA0B7100AC97F6 /* ShapeInterp.cpp in Sources */,
 				92FF02D623AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */,
 				48FB9DCA24A848D0008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */,
@@ -3610,6 +3689,7 @@
 				CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */,
 				48FA474523AA127B00172C3B /* Executor.cpp in Sources */,
 				92FF02EA23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */,
+				CE072A162C91AEE700F190FD /* MNNBGRAToBGR.S in Sources */,
 				48A8A61A21D101DE00C2B9A7 /* Matrix_CV.cpp in Sources */,
 				4DDD8E102B1D70C1005065D1 /* MNNTranspose16Bit8x8.S in Sources */,
 				489D7A8C2550FDC900AD896A /* MetalDeconvolution.mm in Sources */,
@@ -3659,6 +3739,7 @@
 				48F9E54C2493511200E46522 /* MNNPackedMatMul.S in Sources */,
 				C4D4824327BA67DE0021C2B9 /* GeometryDet.cpp in Sources */,
 				92FF026F23AA0B5A00AC97F6 /* CPUInt8ToFloat.cpp in Sources */,
+				CE072A142C91AEE700F190FD /* MNNBGRAToGRAY.S in Sources */,
 				92FF037E23AA0B5A00AC97F6 /* CPUDetectionPostProcess.cpp in Sources */,
 				4D4CF4682760946500A36D9F /* geometric.cpp in Sources */,
 				92FF045023AA0B7100AC97F6 /* ShapeCropAndResize.cpp in Sources */,
@@ -3671,6 +3752,7 @@
 				92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */,
 				CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */,
 				CECF8C78299CAD9400D3875B /* log_util_imp.cpp in Sources */,
+				CE072A152C91AEE700F190FD /* MNNRGBAToGRAYFast.S in Sources */,
 				92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */,
 				952298B22B4D39050043978B /* MetalLoop.mm in Sources */,
 				48925F372744AC2A00919B37 /* ShapeROIAlign.cpp in Sources */,
@@ -3691,6 +3773,7 @@
 				92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */,
 				4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */,
 				EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */,
+				CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */,
 				48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */,
 				48608B51250632EC00CB1D71 /* GeometryComputer.cpp in Sources */,
 				92FF02FF23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */,
@@ -3720,6 +3803,7 @@
 				92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */,
 				92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */,
 				4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */,
+				CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */,
 				489D7A762550FDC800AD896A /* MetalReduction.mm in Sources */,
 				92FF032023AA0B5A00AC97F6 /* MNNMatrixSub.S in Sources */,
 				C43C81FF251894BD00A0FF84 /* ThreadPool.cpp in Sources */,
@@ -4101,7 +4185,7 @@
 				CODE_SIGN_STYLE = Automatic;
 				DEAD_CODE_STRIPPING = YES;
 				DEFINES_MODULE = YES;
-				DEVELOPMENT_TEAM = Q48UX93J22;
+				DEVELOPMENT_TEAM = 6G7464HHUS;
 				DYLIB_COMPATIBILITY_VERSION = 1;
 				DYLIB_CURRENT_VERSION = 1;
 				DYLIB_INSTALL_NAME_BASE = "@rpath";
@@ -4188,7 +4272,7 @@
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
 				ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
 				CODE_SIGN_STYLE = Automatic;
-				DEVELOPMENT_TEAM = Q48UX93J22;
+				DEVELOPMENT_TEAM = 6G7464HHUS;
 				GCC_ENABLE_CPP_EXCEPTIONS = NO;
 				GCC_ENABLE_CPP_RTTI = NO;
 				HEADER_SEARCH_PATHS = (

+ 12 - 4
pymnn/test/model_test.py

@@ -80,18 +80,21 @@ def MNNDataType2NumpyDataType(data_type):
     else:
         return np.float32
 
-def createTensor(tensor, file=''):
+def createTensor(tensor, file='', empty=False):
     shape = tensor.getShape()
     data_type = tensor.getDataType()
     dtype = MNNDataType2NumpyDataType(data_type)
     if file == '':
-        data = np.ones(shape, dtype=dtype)
+        if empty:
+            data = np.zeros(shape, dtype=dtype)
+        else:
+            data = np.ones(shape, dtype=dtype)
     else:
         data = loadtxt(file, shape, dtype)
-    return MNN.Tensor(shape, tensor.getDataType(), data, tensor.getDimensionType())
+    return MNN.Tensor(shape, tensor.getDataType(), data.copy(), tensor.getDimensionType())
 
 def compareTensor(tensor, file, tolerance=5e-2):
-    outputNumpyData = tensor.getNumpyData()
+    outputNumpyData = tensor.getNumpyData().copy()
     expectNumpyData = loadtxt(file, tensor.getShape())
     max_abs_dif = np.abs(outputNumpyData - expectNumpyData).max()
     max_exp_val = np.abs(expectNumpyData).max()
@@ -117,6 +120,11 @@ def modelTest(modelPath, givenName, expectName):
     net = MNN.Interpreter(modelPath)
     session = net.createSession()
     allInput = net.getSessionInputAll(session)
+    # zero for all inputs
+    for name in allInput:
+        inputTensor = allInput[name] 
+        inputHost = createTensor(inputTensor, givenName, True)
+        inputTensor.copyFrom(inputHost)
     # input
     inputTensor = net.getSessionInput(session)
     inputHost = createTensor(inputTensor, givenName)

+ 1 - 0
source/backend/arm82/Arm82Backend.cpp

@@ -118,6 +118,7 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor
         CPUBackend::onCopyBuffer(srcTensor, dstTensor);
         return;
     }
+    _resetDynamicMemory();
     auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
     auto dest   = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
     auto srcType = MNN_FORWARD_CPU;

+ 10 - 14
source/backend/arm82/Arm82Functions.cpp

@@ -35,12 +35,14 @@ void MNNPackedMatMulFP16(float* C, const float* A, const float* B, const size_t*
 // parameter: [aStride, l, h, cStride, bExtraStride]
 void MNNPackedMatMulRemainFP16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b);
 
-#ifdef MNN_LOW_MEMORY
+#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM
 void MNNPackedMatMulFP16_int4(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b);
 void MNNPackedMatMulRemainFP16_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b);
 void MNNPackedMatMulFP16_int8(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b);
 void MNNPackedMatMulRemainFP16_int8(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b);
+#endif
 
+#ifdef MNN_LOW_MEMORY
 void MNNAbsMaxFP16(const float* source, float* absmax, size_t src_depth_quad, size_t realSize, int pack);
 void MNNQuantScaleFP16(float* sum, float* absmax, float* quant_scale, float* dequant_scale, size_t thread, size_t batch);
 void MNNDynamicQuantFP16(const float* src, int8_t* dst, const float* scale, size_t src_depth_quad, size_t realSize, int pack);
@@ -48,8 +50,6 @@ void MNNQuantSumFP16(float* sum, const float* dequant_scale, size_t thread, size
 #endif
 #if defined(__aarch64__)
 void CountMinMaxValue_FP16(float* source, float* minVal, float* maxVal, size_t sizeQuad);
-void MNNSumByAxisLForMatmul_A_ARM86(float* dest, int8_t* source, const float* dequantScale, ssize_t realDstCount, SumByAxisParams sumParams);
-void MNNSumByAxisLForMatmul_A_ARM82(float* dest, int8_t* source, const float* dequantScale, ssize_t realDstCount, SumByAxisParams sumParams);
 #endif
 void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow);
 
@@ -735,29 +735,25 @@ bool Arm82Functions::init() {
     FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMul, MNNPackedMatMulFP16);
     FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMulRemain, MNNPackedMatMulRemainFP16);
 #if defined(__aarch64__)
-#ifdef MNN_LOW_MEMORY
+    gInstance->supportFp16arith = origin->supportFp16arith;
+    gInstance->supportSDot = origin->supportSDot;
+    gInstance->supportI8mm = origin->supportI8mm;
+#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM
     // Weight Dequant Gemm Kernels
     FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMul_int4, MNNPackedMatMulFP16_int4);
     FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMulRemain_int4, MNNPackedMatMulRemainFP16_int4);
     FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMul_int8, MNNPackedMatMulFP16_int8);
     FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMulRemain_int8, MNNPackedMatMulRemainFP16_int8);
+#endif
+#ifdef MNN_LOW_MEMORY
     // Dynamic Qaunt Helper Functions
     FUNC_PTR_ASSIGN(gInstance->MNNAbsMax, MNNAbsMaxFP16);
     FUNC_PTR_ASSIGN(gInstance->MNNQuantScale, MNNQuantScaleFP16);
     FUNC_PTR_ASSIGN(gInstance->MNNDynamicQuant, MNNDynamicQuantFP16);
     FUNC_PTR_ASSIGN(gInstance->MNNQuantSum, MNNQuantSumFP16);
     FUNC_PTR_ASSIGN(gInstance->MNNCountMaxMinValue, ARM82CountMinMaxValue);
-    // Dynamic Quant Gemm Kernels.
-    gInstance->supportFp16arith = origin->supportFp16arith;
-    gInstance->supportSDot = origin->supportSDot;
-    gInstance->supportI8mm = origin->supportI8mm;
 #endif
-    if (gInstance->supportSDot) {
-        FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, MNNSumByAxisLForMatmul_A_ARM82);
-    }
-    if (gInstance->supportI8mm) {
-        FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, MNNSumByAxisLForMatmul_A_ARM86);
-    }
+    FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, origin->MNNSumByAxisLForMatmul_A);
 #endif
     FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A);
     FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode);

+ 7 - 0
source/backend/arm82/CMakeLists.txt

@@ -10,10 +10,17 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
     if (MNN_LOW_MEMORY)
         file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/low_memory/*)
     endif()
+    if (MNN_CPU_WEIGHT_DEQUANT_GEMM)
+        file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/normal_memory/*)
+    endif()
     add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM})
     if (MNN_LOW_MEMORY)
         target_compile_options(MNN_Arm82 PRIVATE -DMNN_LOW_MEMORY)
     endif()
+    
+    if (MNN_CPU_WEIGHT_DEQUANT_GEMM)
+        target_compile_options(MNN_Arm82 PRIVATE -DMNN_CPU_WEIGHT_DEQUANT_GEMM)
+    endif()
     target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -DENABLE_ARMV82)
 else()
 # Building fat binary requires multiple separate builds and lipo-by-hand under CMake's design

+ 17 - 7
source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuanInput_ARM82.S

@@ -90,21 +90,31 @@
 Note: Only used in dynamic quant,so do not need compare min max!
  */
 asm_function DynamicQuanInput_ARM82
-//void DynamicQuanInput_ARM82(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax, size_t zeroPoint);
-//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint
+//void DynamicQuanInput_ARM82(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, float* zeroPoint, ssize_t quanParamVec);
+//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint, x7:quanParamVec
 stp d14, d15, [sp, #-64]!
 stp d12, d13, [sp, #16]
 stp d10, d11, [sp, #32]
 stp d8,  d9,  [sp, #48]
 
 ld1 {v29.s}[0], [x3] // Load scale
-// copy zero point
-dup v30.4s, w6
-fcvtn v31.4h, v29.4s
-scvtf v30.4s, v30.4s
+ld1 {v30.s}[0], [x6] // Load zero
+
+and x8, x7, #1 // if load vector scale
+and x9, x7, #2 // if load vector zero
+cbz x8, LOAD_VECTOR_ZERO
+ld1 {v29.4s}, [x3] // scale
+
+LOAD_VECTOR_ZERO:
+cbz x9, START
+ld1 {v30.4s}, [x6] // zero
 
+
+START:
+// copy zero point
+fcvtn v31.4h, v29.4s // fp16 scale
+fcvtn v30.4h, v30.4s // fp16 zero
 dup v31.8h, v31.h[0]
-fcvtn v30.4h, v30.4s
 dup v30.8h, v30.h[0]
 
 FL28:
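
The rewritten prologue takes the zero point through a float pointer and adds a quanParamVec flag word: bit 0 asks for a four-lane scale vector, bit 1 for a four-lane zero-point vector, and otherwise only lane 0 is loaded and broadcast. A rough C++ equivalent of that parameter handling, with an illustrative helper name:

// scalep and zeroPoint follow the new DynamicQuanInput_ARM82 signature; quanParamVec is the flag word.
inline void loadQuantParams(const float* scalep, const float* zeroPoint, long quanParamVec,
                            float scaleVec[4], float zeroVec[4]) {
    for (int i = 0; i < 4; ++i) {
        scaleVec[i] = (quanParamVec & 1) ? scalep[i]    : scalep[0];    // bit 0: per-lane scale
        zeroVec[i]  = (quanParamVec & 2) ? zeroPoint[i] : zeroPoint[0]; // bit 1: per-lane zero point
    }
}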

+ 3 - 6
source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantAndReorder_ARM82.S

@@ -1,5 +1,5 @@
 //
-//  DynamicQuanInput_ARM82.S
+//  DynamicQuanInputAndReorder_ARM82.S
 //  MNN
 //
 //  Created by MNN on 2019/01/22.
@@ -101,15 +101,12 @@ stp d10, d11, [sp, #32]
 stp d8,  d9,  [sp, #48]
 
 ld1 {v29.s}[0], [x3] // Load scale
-// copy zero point
-dup v30.4s, w6
+ld1 {v30.s}[0], [x6] // Load zero point
 fcvtn v31.4h, v29.4s
-scvtf v30.4s, v30.4s
-
+fcvtn v30.4h, v30.4s
 add x13, x8, x8
 
 dup v31.8h, v31.h[0]
-fcvtn v30.4h, v30.4s
 dup v30.8h, v30.h[0]
 
 mov x9, x1 // first N*4

+ 13 - 38
source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S

@@ -115,19 +115,16 @@ ldr x27, [x6, #64]  // blockNum
 mov x21, #16 // sizeof(float16_t) * PACK
 mul x27, x27, x3
 Start:
-lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT
+lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
 mov x22, #48 // src_steps
-add x24, x15, x15
 ldr x27, [x6, #80] // extra scale
 TILE_12:
     cmp x7, #12
     blt TILE_8
 
 L8LoopDz_TILE_12:
-    // ld1 {v0.4s, v1.4s}, [x9], #32 // bias
     mov x11, x1
     mov x13, x3
-    // Init 0
     SET_BIAS v8, v9, v10, v11
     SET_BIAS v12, v13, v14, v15
     SET_BIAS v16, v17, v18, v19
@@ -137,13 +134,13 @@ L8LoopDz_TILE_12:
 
     mov x28, x2
     L8LoopSz_TILE_12:
-        ld1 {v3.16b}, [x2], x15 // weight
+        ld1 {v3.16b, v4.16b}, [x2], #32 // weight
         ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
         .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
-        ld1 {v4.16b}, [x2], #16
+
         .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0]
         .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
         .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
@@ -156,7 +153,7 @@ L8LoopDz_TILE_12:
         .inst 0x4fa0e095 // sdot v21.4s, v4.16b, v0.4b[1]
         .inst 0x4f80e896 // sdot v22.4s, v4.16b, v0.4b[2]
         .inst 0x4fa0e897 // sdot v23.4s, v4.16b, v0.4b[3]
-        sub x2, x2, x15
+
         .inst 0x4f81e098 // sdot v24.4s, v4.16b, v1.4b[0]
         .inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]
         .inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]
@@ -169,9 +166,7 @@ L8LoopDz_TILE_12:
         bne L8LoopSz_TILE_12
 
     L8LoopSzEnd_TILE_12:
-    //add x2, x2, x15
-    //add x24, x15, x15
-    add x2, x28, x24
+    add x2, x28, x15
     sub x5, x5, #1
 
     L8Tile12Quan:
@@ -217,8 +212,6 @@ L8LoopDz_TILE_12:
     MLA_WEIGHTZERO v18, v4, v5, 2 // tile:10, oc:0-3 
     MLA_WEIGHTZERO v19, v4, v5, 3 // tile:11, oc:0-3
 
-    //ld1r {v0.4s}, [x23] // f32 min
-    //ld1r {v1.4s}, [x24] // f32 max
     MLA_WEIGHTZERO v20, v2, v6, 0 // tile:0, oc:4-7
     MLA_WEIGHTZERO v21, v2, v6, 1 // tile:1, oc:4-7
     MLA_WEIGHTZERO v22, v2, v6, 2 // tile:2, oc:4-7
@@ -297,8 +290,6 @@ L8LoopDz_TILE_12:
     blt End
 
 TILE_8:
-    //ld1r {v26.4s}, [x23] // f32 min
-    //ld1r {v27.4s}, [x24] // f32 max
     cmp x7, #8
     blt TILE_4
     mov x10, x0
@@ -319,18 +310,18 @@ L8LoopDz_TILE_8:
     SET_BIAS v20, v21, v22, v23
     mov x28, x12
     L8LoopSz_TILE_8:
-        ld1 {v3.16b}, [x12], x15 // weight
+        ld1 {v3.16b, v4.16b}, [x12], #32 // weight
         ld1 {v0.16b, v1.16b}, [x11], x22 // src
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
         .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
-        ld1 {v4.16b}, [x12], #16
+
         .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0]
         .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
         .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
         .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3]
-        sub x12, x12, x15
+
         .inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]
         .inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]
         .inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]
@@ -343,9 +334,7 @@ L8LoopDz_TILE_8:
         bne L8LoopSz_TILE_8
 
     L8LoopSzEnd_TILE_8:
-    //add x12, x12, x15
-    //add x24, x15, x15
-    add x12, x28, x24
+    add x12, x28, x15
     sub x14, x14, #1
 
     L8Tile8Quan:
@@ -468,15 +457,13 @@ L8LoopDz_TILE_4:
 
     mov x28, x12
     L8LoopSz_TILE_4:
-        ld1 {v3.16b}, [x12], x15 // weight
+        ld1 {v3.16b, v4.16b}, [x12], #32 // weight
         ld1 {v0.16b}, [x11], x22 // src
-        ld1 {v4.16b}, [x12], #16 // weight
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
         .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
         subs x13, x13, #1
-        sub x12, x12, x15
         .inst 0x4f80e08c // sdot v12.4s, v4.16b, v0.4b[0]
         .inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]
         .inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]
@@ -484,9 +471,7 @@ L8LoopDz_TILE_4:
         bne L8LoopSz_TILE_4
 
     L8LoopSzEnd_TILE_4:
-    //add x12, x12, x15
-    //add x24, x15, x15
-    add x12, x28, x24
+    add x12, x28, x15
     sub x14, x14, #1
 
     L8Tile4Quan:
@@ -571,23 +556,17 @@ L8LoopDz_TILE_1:
 
     movi v8.16b, #0
     movi v9.16b, #0
-    //mov v8.16b, v0.16b
-    //mov v9.16b, v1.16b
     mov x28, x12
     L8LoopSz_TILE_1:
-        ld1 {v3.16b}, [x12], x15 // weight
+        ld1 {v3.16b, v4.16b}, [x12], #32 // weight
         ld1 {v0.s}[0], [x11], x22 // src
-        ld1 {v4.16b}, [x12], #16 // weight
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         subs x13, x13, #1
-        sub x12, x12, x15
         .inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]
         bne L8LoopSz_TILE_1
 
     L8LoopSzEnd_TILE_1:
-    //add x12, x12, x15
-    //add x24, x15, x15
-    add x12, x28, x24
+    add x12, x28, x15
     sub x14, x14, #1
 
     L8Tile1Quan:
@@ -630,11 +609,7 @@ L8LoopDz_TILE_1:
     sub x23, x23, #2
     fmax v0.8h, v24.8h, v0.8h
     fmin v0.8h, v25.8h, v0.8h
-    // st1 {v8.4s}, [x10], x4
-    // st1 {v9.4s}, [x10], x4
 
-    //fcvtn v0.4h, v8.4s
-    //fcvtn2 v0.8h, v9.4s
     TILE1_STORE:
     st1 {v0.8h}, [x10], x4
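
The int8 GEMM tiles no longer load the weights with an x15 stride and rewind every iteration: the two 16-byte halves of each weight step are stored back to back, so the inner loop simply advances 32 bytes and the per-block stride doubles (lsl #5 instead of #4, and the extra x24 = 2*x15 register goes away). A condensed C++ view of the new addressing, with illustrative names:

#include <cstddef>
#include <cstdint>

// Address of the 32-byte weight step consumed per inner iteration (dz = output block, sz = depth step).
// blockStride corresponds to x15 = src_depth_quad * UNIT * SRC_UNIT bytes; within a block the
// steps are contiguous, so no per-iteration rewind is needed.
inline const int8_t* weightStep(const int8_t* weightBase, size_t blockStride, int dz, int sz) {
    return weightBase + (size_t)dz * blockStride + (size_t)sz * 32;
}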
 

+ 69 - 60
source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S

@@ -114,16 +114,14 @@ ldr x27, [x6, #64]  // blockNum
 mov x21, #16 // sizeof(float16_t) * PACK
 mul x27, x27, x3
 Start:
-lsl x15, x27, #3 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
+lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
 mov x22, #48 // src_steps
-add x24, x15, x15
 ldr x27, [x6, #80] // extra scale
 TILE_12:
     cmp x7, #12
     blt TILE_8
 
 L8LoopDz_TILE_12:
-    // ld1 {v0.4s, v1.4s}, [x9], #32 // bias
     mov x11, x1
     mov x13, x3
     movi v7.16b, #15
@@ -138,13 +136,11 @@ L8LoopDz_TILE_12:
 
     mov x28, x2
     L8LoopSz_TILE_12:
-        ld1 {v3.d}[0], [x2], x15 // weight
-        ld1 {v4.d}[0], [x2], #8
+        ld1 {v5.16b}, [x2], #16 // weight
         ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src
         // int4->int8
-        ushr v5.16b, v3.16b, #4
-        and v6.16b, v3.16b, v7.16b
-        zip1 v3.16b, v5.16b, v6.16b
+        ushr v3.16b, v5.16b, #4
+        and v4.16b, v5.16b, v7.16b
 
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
@@ -155,10 +151,6 @@ L8LoopDz_TILE_12:
         .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
         .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
         .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3]
-        // int4->int8
-        ushr v5.16b, v4.16b, #4
-        and v6.16b, v4.16b, v7.16b
-        zip1 v4.16b, v5.16b, v6.16b
 
         .inst 0x4f82e070 // sdot v16.4s, v3.16b, v2.4b[0]
         .inst 0x4fa2e071 // sdot v17.4s, v3.16b, v2.4b[1]
@@ -168,7 +160,7 @@ L8LoopDz_TILE_12:
         .inst 0x4fa0e095 // sdot v21.4s, v4.16b, v0.4b[1]
         .inst 0x4f80e896 // sdot v22.4s, v4.16b, v0.4b[2]
         .inst 0x4fa0e897 // sdot v23.4s, v4.16b, v0.4b[3]
-        sub x2, x2, x15
+
         .inst 0x4f81e098 // sdot v24.4s, v4.16b, v1.4b[0]
         .inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]
         .inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]
@@ -181,7 +173,7 @@ L8LoopDz_TILE_12:
         bne L8LoopSz_TILE_12
 
     L8LoopSzEnd_TILE_12:
-    add x2, x28, x24
+    add x2, x28, x15
     sub x5, x5, #1
 
     L8Tile12Quan:
@@ -227,8 +219,6 @@ L8LoopDz_TILE_12:
     MLA_WEIGHTZERO v18, v4, v5, 2 // tile:10, oc:0-3 
     MLA_WEIGHTZERO v19, v4, v5, 3 // tile:11, oc:0-3
 
-    //ld1r {v0.4s}, [x23] // f32 min
-    //ld1r {v1.4s}, [x24] // f32 max
     MLA_WEIGHTZERO v20, v2, v6, 0 // tile:0, oc:4-7
     MLA_WEIGHTZERO v21, v2, v6, 1 // tile:1, oc:4-7
     MLA_WEIGHTZERO v22, v2, v6, 2 // tile:2, oc:4-7
@@ -304,7 +294,7 @@ L8LoopDz_TILE_12:
     L8Tile12LoopCheck:
     cmp x5, #1
     bge L8LoopDz_TILE_12
-    blt End
+    b End
 
 TILE_8:
     cmp x7, #8
@@ -327,27 +317,24 @@ L8LoopDz_TILE_8:
     SET_BIAS v20, v21, v22, v23
     mov x28, x12
     L8LoopSz_TILE_8:
-        ld1 {v3.d}[0], [x12], x15 // weight
-        ld1 {v4.d}[0], [x12], #8
+        ld1 {v5.16b}, [x12], #16 // weight
         ld1 {v0.16b, v1.16b}, [x11], x22 // src
         // int4->int8
-        ushr v5.16b, v3.16b, #4
-        and v6.16b, v3.16b, v7.16b
-        zip1 v3.16b, v5.16b, v6.16b
+        ushr v3.16b, v5.16b, #4
+        and v4.16b, v5.16b, v7.16b
+        //zip1 v3.16b, v5.16b, v6.16b
+        //zip2 v4.16b, v5.16b, v6.16b
 
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
         .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
-        // int4->int8
-        ushr v5.16b, v4.16b, #4
-        and v6.16b, v4.16b, v7.16b
-        zip1 v4.16b, v5.16b, v6.16b
+
         .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0]
         .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
         .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
         .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3]
-        sub x12, x12, x15
+
         .inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]
         .inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]
         .inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]
@@ -360,7 +347,7 @@ L8LoopDz_TILE_8:
         bne L8LoopSz_TILE_8
 
     L8LoopSzEnd_TILE_8:
-    add x12, x28, x24
+    add x12, x28, x15
     sub x14, x14, #1
 
     L8Tile8Quan:
@@ -446,10 +433,6 @@ L8LoopDz_TILE_8:
     st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x10], #64
     st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x4
 
-    //st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
-    //st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
-    //st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x10], #64
-    //st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x10], x4
     add x4, x4, #64
 
     L8Tile8LoopCheck:
@@ -483,24 +466,20 @@ L8LoopDz_TILE_4:
 
     mov x28, x12
     L8LoopSz_TILE_4:
-        ld1 {v3.d}[0], [x12], x15 // weight
+        ld1 {v5.16b}, [x12], #16 // weight
         ld1 {v0.16b}, [x11], x22 // src
-        ld1 {v4.d}[0], [x12], #8 // weight
         // int4->int8
-        ushr v5.16b, v3.16b, #4
-        and v6.16b, v3.16b, v7.16b
-        zip1 v3.16b, v5.16b, v6.16b
+        ushr v3.16b, v5.16b, #4
+        and v4.16b, v5.16b, v7.16b
+        //zip1 v3.16b, v5.16b, v6.16b
+        //zip2 v4.16b, v5.16b, v6.16b
 
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
         .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
-        // int4->int8
-        ushr v5.16b, v4.16b, #4
-        and v6.16b, v4.16b, v7.16b
-        zip1 v4.16b, v5.16b, v6.16b
+
         subs x13, x13, #1
-        sub x12, x12, x15
         .inst 0x4f80e08c // sdot v12.4s, v4.16b, v0.4b[0]
         .inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]
         .inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]
@@ -508,7 +487,7 @@ L8LoopDz_TILE_4:
         bne L8LoopSz_TILE_4
 
     L8LoopSzEnd_TILE_4:
-    add x12, x28, x24
+    add x12, x28, x15
     sub x14, x14, #1
 
     L8Tile4Quan:
@@ -593,29 +572,61 @@ L8LoopDz_TILE_1:
 
     movi v8.16b, #0
     movi v9.16b, #0
+    
     mov x28, x12
-    L8LoopSz_TILE_1:
-        ld1 {v3.d}[0], [x12], x15 // weight
+    cmp x13, #4
+    blt L8LoopSz_TILE_1_lu1
+
+    L8LoopSz_TILE_1_lu4:
+        ld1 {v3.16b, v4.16b, v5.16b, v6.16b}, [x12], #64 // weight: hu=0,1,2,3,pack=0~7
         ld1 {v0.s}[0], [x11], x22 // src
-        ld1 {v4.d}[0], [x12], #8 // weight
-        // int4->int8
-        ushr v5.16b, v3.16b, #4
-        and v6.16b, v3.16b, v7.16b
-        zip1 v3.16b, v5.16b, v6.16b
+        ld1 {v0.s}[1], [x11], x22
+        ld1 {v0.s}[2], [x11], x22
+        ld1 {v0.s}[3], [x11], x22
 
-        .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
+        sub x13, x13, #4
+        // int4->int8
+        ushr v12.16b, v3.16b, #4
+        and v22.16b, v3.16b, v7.16b
+
+        ushr v15.16b, v4.16b, #4
+        and v23.16b, v4.16b, v7.16b
+
+        ushr v18.16b, v5.16b, #4
+        and v24.16b, v5.16b, v7.16b
+
+        ushr v21.16b, v6.16b, #4
+        and v25.16b, v6.16b, v7.16b
+
+        cmp x13, #4
+        //sub x12, x12, x15
+        .inst 0x4f80e188 // sdot v8.4s, v12.16b, v0.4b[0]
+        .inst 0x4f80e2c9 // sdot v9.4s, v22.16b, v0.4b[0]
+        .inst 0x4fa0e1e8 // sdot v8.4s, v15.16b, v0.4b[1]
+        .inst 0x4fa0e2e9 // sdot v9.4s, v23.16b, v0.4b[1]
+        .inst 0x4f80ea48 // sdot v8.4s, v18.16b, v0.4b[2]
+        .inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]
+        .inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]
+        .inst 0x4fa0eb29 // sdot v9.4s, v25.16b, v0.4b[3]
+        bge L8LoopSz_TILE_1_lu4
+
+    cbz x13, L8LoopSzEnd_TILE_1
+
+    L8LoopSz_TILE_1_lu1:
+        ld1 {v4.16b}, [x12], #16 // weight
+        ld1 {v0.s}[0], [x11], x22 // src
+        //ld1 {v4.d}[0], [x12], #8 // weight
         subs x13, x13, #1
         // int4->int8
-        ushr v5.16b, v4.16b, #4
-        and v6.16b, v4.16b, v7.16b
-        zip1 v4.16b, v5.16b, v6.16b
-        sub x12, x12, x15
+        ushr v3.16b, v4.16b, #4
+        and v12.16b, v4.16b, v7.16b
 
-        .inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]
-        bne L8LoopSz_TILE_1
+        .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
+        .inst 0x4f80e189 // sdot v9.4s, v12.16b, v0.4b[0]
+        bne L8LoopSz_TILE_1_lu1
 
     L8LoopSzEnd_TILE_1:
-    add x12, x28, x24
+    add x12, x28, x15
     sub x14, x14, #1
 
     L8Tile1Quan:
@@ -658,8 +669,6 @@ L8LoopDz_TILE_1:
     sub x23, x23, #2
     fmax v0.8h, v24.8h, v0.8h
     fmin v0.8h, v25.8h, v0.8h
-    // st1 {v8.4s}, [x10], x4
-    // st1 {v9.4s}, [x10], x4
     TILE1_STORE:
     st1 {v0.8h}, [x10], x4
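
The int4 kernels get the same contiguous layout, which also simplifies the nibble unpack: a single 16-byte load now provides the oc 0-3 weights in the high nibbles and the oc 4-7 weights in the low nibbles, so the zip1 interleave step is gone. A scalar C++ sketch of the per-byte unpack, assuming the packed nibble order used above:

#include <cstdint>

// Split one packed weight byte into the two 4-bit values fed to the sdot halves
// (mirrors ushr v3.16b, v5.16b, #4 and and v4.16b, v5.16b, v7.16b with v7 = 0x0F).
inline void unpackInt4(uint8_t packed, int8_t& hi, int8_t& lo) {
    hi = (int8_t)(packed >> 4);   // weights for oc 0-3
    lo = (int8_t)(packed & 0x0F); // weights for oc 4-7
}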
 

+ 61 - 23
source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S

@@ -150,6 +150,7 @@ LoopDz_TILE_10:
     mov x12, x2 // weight
     mov x13, x3 // src_depth_quad
     mov x10, x0 // tag dst address
+    movi v2.16b, #15
 
     SET_0_5 v12, v16, v20, v24, v28 // oc:0,1,0,1
     SET_0_5 v13, v17, v21, v25, v29 // oc:2,3,2,3
@@ -158,7 +159,6 @@ LoopDz_TILE_10:
 
 LoopSz_TILE_10:
     ld1 {v0.16b, v1.16b}, [x12], #32                    // weight
-    movi v2.16b, #15
     ld1 {v3.16b, v4.16b, v5.16b, v6.16b}, [x11], #64    // src: E0-E9
     ld1 {v7.16b}, [x11], #16
     // int4->int8
@@ -763,50 +763,88 @@ TILE_1:
     mov x20, x9 // bias
     mov x6, x28 // weightQuanBias
 LoopDz_TILE_1:
-    //ld1 {v7.4s, v8.4s}, [x20], #32  // bias
     mov x11, x1 // src
     mov x12, x25 // weight
     mov x13, x3 // src_depth_quad
     mov x10, x26
 
-    //dup v16.2d, v7.d[0] // oc:0,1,0,1
-    //dup v17.2d, v7.d[1] // oc:2,3,2,3
-    //dup v18.2d, v8.d[0] // oc:4,5,4,5
-    //dup v19.2d, v8.d[1] // oc:6,7,6,7
     movi v16.4s, #0 // oc:0,1,0,1
     movi v17.4s, #0 // oc:2,3,2,3
     movi v18.4s, #0 // oc:4,5,4,5
     movi v19.4s, #0 // oc:6,7,6,7
 
-    //movi v22.4s, #0 // oc:0,1,0,1
-    //movi v23.4s, #0 // oc:2,3,2,3
-    //movi v24.4s, #0 // oc:4,5,4,5
-    //movi v25.4s, #0 // oc:6,7,6,7
+    cmp x13, #4
+    blt LoopSz1_TILE_1_lu1
+LoopSz1_TILE_1_lu4:
+    ld1 {v5.16b, v6.16b, v7.16b, v8.16b}, [x12], #64     // weight
+    ld1 {v9.16b, v10.16b, v11.16b, v12.16b}, [x12], #64
+    ld1 {v0.8b}, [x11], x22                              // src
+    ld1 {v1.8b}, [x11], x22
+    ld1 {v2.8b}, [x11], x22
+    ld1 {v3.8b}, [x11], x22
+
+    // int4->int8
+    ushr v4.16b, v5.16b, #4
+    ushr v14.16b, v6.16b, #4
+    and v13.16b, v5.16b, v30.16b
+    and v15.16b, v6.16b, v30.16b
+
+    ushr v20.16b, v7.16b, #4
+    ushr v21.16b, v8.16b, #4
+    and v22.16b, v7.16b, v30.16b
+    and v23.16b, v8.16b, v30.16b
+
+    ushr v24.16b, v9.16b, #4
+    ushr v25.16b, v10.16b, #4
+    and v26.16b, v9.16b, v30.16b
+    and v27.16b, v10.16b, v30.16b
+
+    ushr v5.16b, v11.16b, #4
+    ushr v6.16b, v12.16b, #4
+    and v7.16b, v11.16b, v30.16b
+    and v8.16b, v12.16b, v30.16b
+
+    sub x13, x13, #4
+
+    .inst 0x4e84a410 // smmla v16.4s, v0.16b, v4.16b
+    .inst 0x4e8ea411 // smmla v17.4s, v0.16b, v14.16b
+    .inst 0x4e8da412 // smmla v18.4s, v0.16b, v13.16b
+    .inst 0x4e8fa413 // smmla v19.4s, v0.16b, v15.16b
+
+    .inst 0x4e94a430 // smmla v16.4s, v1.16b, v20.16b
+    .inst 0x4e95a431 // smmla v17.4s, v1.16b, v21.16b
+    .inst 0x4e96a432 // smmla v18.4s, v1.16b, v22.16b
+    .inst 0x4e97a433 // smmla v19.4s, v1.16b, v23.16b
+    cmp x13, #4
+    .inst 0x4e98a450 // smmla v16.4s, v2.16b, v24.16b
+    .inst 0x4e99a451 // smmla v17.4s, v2.16b, v25.16b
+    .inst 0x4e9aa452 // smmla v18.4s, v2.16b, v26.16b
+    .inst 0x4e9ba453 // smmla v19.4s, v2.16b, v27.16b
+
+    .inst 0x4e85a470 // smmla v16.4s, v3.16b, v5.16b
+    .inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b
+    .inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b
+    .inst 0x4e88a473 // smmla v19.4s, v3.16b, v8.16b
+    
+    bge LoopSz1_TILE_1_lu4
+    cbz x13, LoopSzEnd_TILE_1
 
-LoopSz1_TILE_1:
-    // src    : 1 x [1 x 8] : v2
-    // weight : 2 x [2 x 8] : v0-1
-    // dst    : 1 x 2 x [2] : v30-v31
+LoopSz1_TILE_1_lu1:
     ld1 {v13.16b, v14.16b}, [x12], #32   // weight
-    ld1 {v2.8b}, [x11], x22           // src
+    ld1 {v2.8b}, [x11], x22              // src
     // int4->int8
     ushr v0.16b, v13.16b, #4
     and v3.16b, v13.16b, v30.16b
     ushr v1.16b, v14.16b, #4
     and v4.16b, v14.16b, v30.16b
+    subs x13, x13, #1
 
     .inst 0x4e80a450 // smmla v16.4s, v2.16b, v0.16b
     .inst 0x4e81a451 // smmla v17.4s, v2.16b, v1.16b
     .inst 0x4e83a452 // smmla v18.4s, v2.16b, v3.16b
     .inst 0x4e84a453 // smmla v19.4s, v2.16b, v4.16b
-    subs x13, x13, #1
-    bne LoopSz1_TILE_1
-
-    LoopSz_TILE_1_ADD:
-    //add v16.4s, v16.4s, v22.4s
-    //add v17.4s, v17.4s, v23.4s
-    //add v18.4s, v18.4s, v24.4s
-    //add v19.4s, v19.4s, v25.4s
+    
+    bne LoopSz1_TILE_1_lu1
 
 LoopSzEnd_TILE_1:
     add x25, x25, x15
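
Both TILE_1 inner loops (here and in the ARMV82 w4 kernel above) are now split into an unrolled-by-four body (*_lu4) and a one-step tail (*_lu1), keeping four weight loads and their dot products in flight per iteration. The control flow, reduced to C++ with a placeholder step() that is not an MNN function:

template <typename F>
void tile1Loop(int depthQuad, F&& step) {   // depthQuad corresponds to x13
    int i = 0;
    for (; depthQuad - i >= 4; i += 4) {    // *_lu4: unrolled by four
        step(i); step(i + 1); step(i + 2); step(i + 3);
    }
    for (; i < depthQuad; ++i) {            // *_lu1: remaining steps
        step(i);
    }
}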

source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S → source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulFP16_int4.S


source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S → source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulFP16_int8.S


source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S → source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulRemainFP16_int4.S


source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S → source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulRemainFP16_int8.S


+ 1 - 1
source/backend/coreml/backend/CoreMLBackend.cpp

@@ -300,7 +300,7 @@ namespace MNN {
 
     CoreMLRuntime::~CoreMLRuntime() {}
 
-    Backend* CoreMLRuntime::onCreate(const BackendConfig* config) const {
+    Backend* CoreMLRuntime::onCreate(const BackendConfig* config, Backend* origin) const {
         return new CoreMLBackend(this);
     }
 

+ 1 - 1
source/backend/coreml/backend/CoreMLBackend.hpp

@@ -26,7 +26,7 @@ namespace MNN {
         CoreMLRuntime(const Backend::Info& info);
         virtual ~CoreMLRuntime();
         virtual CompilerType onGetCompilerType() const override;
-        virtual Backend* onCreate(const BackendConfig* conf) const override;
+        virtual Backend* onCreate(const BackendConfig* conf, Backend* origin) const override;
         virtual void onGabageCollect(int level) override;
         virtual std::pair<const void*, size_t> onGetCache() override {
             return std::make_pair(mCacheBuffer, mCacheSize);

+ 4 - 0
source/backend/cpu/CMakeLists.txt

@@ -24,6 +24,10 @@ if(MNN_LOW_MEMORY)
   target_compile_options(MNNCPU PRIVATE -DMNN_LOW_MEMORY)
 endif()
 
+if(MNN_CPU_WEIGHT_DEQUANT_GEMM)
+  target_compile_options(MNNCPU PRIVATE -DMNN_CPU_WEIGHT_DEQUANT_GEMM)
+endif()
+
 # X86_64 AVX/SSE
 if (MNN_USE_SSE)
     include(${CMAKE_CURRENT_LIST_DIR}/x86_x64/CMakeLists.txt)

+ 144 - 39
source/backend/cpu/CPUAttention.cpp

@@ -30,22 +30,51 @@
 namespace MNN {
 
 template <typename T>
-static void pack_query(Tensor* query, char* pack_q, int mNumHead, int mHeadDim, int eP, int seq_len, int h, float q_scale) {
-    T * query_src = query->host<T>();
-    T * query_dst = reinterpret_cast<T*>(pack_q);
-    for (int i = 0; i < seq_len; i++) {
-        int out_index = i / eP;
-        int in_index  = i % eP;
-        for (int j = 0; j < mHeadDim; j++) {
-            query_dst[out_index * mHeadDim * eP + j * eP + in_index] = query_src[i * mNumHead * mHeadDim + h * mHeadDim + j] * q_scale;
+void CPUAttention::pack_query(Tensor* query, char* pack_q, char* sum_q, int seq_len, int h, float q_scale) {
+    if (mUseGemmInt8) { // Shape of Query: numhead, [seqlen/eP8, headdim/lP8, eP8, lP8]
+        mMinQ[h] = query->host<T>()[h * mHeadDim];
+        mMaxQ[h] = query->host<T>()[h * mHeadDim];
+        for (int i = 0; i < seq_len; i++) {
+            T * query_src = query->host<T>() + i * mNumHead * mHeadDim + h * mHeadDim;
+            for (int j = 0; j < mHeadDim; j++) {
+                mMinQ[h] = ALIMIN(mMinQ[h], query_src[j]);
+                mMaxQ[h] = ALIMAX(mMaxQ[h], query_src[j]);
+            }
+        }
+        mQueryScale[h] = (mMaxQ[h] - mMinQ[h]) / 255.0f;
+        mQueryZeroPoint[h] = -255.0f * mMinQ[h] / (mMaxQ[h] - mMinQ[h]) - 128.0;
+        for (int i = 0; i < seq_len; i++) {
+            T * query_src = query->host<T>() + i * mNumHead * mHeadDim + h * mHeadDim;
+            float sumQ = 0;
+            int out_index = i / eP8;
+            int in_index  = i % eP8;
+            for (int j = 0; j < mHeadDim; j++) {
+                int a = j / lP8;
+                int b = j % lP8;
+                int quant_res = (int)roundf(query_src[j] / mQueryScale[h] + mQueryZeroPoint[h]);
+                sumQ += quant_res;
+                *((int8_t*)pack_q + out_index * UP_DIV(mHeadDim, lP8) * eP8 * lP8 + a * eP8 * lP8 + in_index * lP8 + b) = quant_res;
+            }
+            *((float*)sum_q + out_index * eP8 + in_index) = sumQ * mQueryScale[h];
+        }
+    }
+    else {
+        T * query_src = query->host<T>();
+        T * query_dst = reinterpret_cast<T*>(pack_q);    
+        for (int i = 0; i < seq_len; i++) {
+            int out_index = i / eP;
+            int in_index  = i % eP;
+            for (int j = 0; j < mHeadDim; j++) {
+                query_dst[out_index * mHeadDim * eP + j * eP + in_index] = query_src[i * mNumHead * mHeadDim + h * mHeadDim + j] * q_scale;
+            }
         }
     }
 }
 
 template <typename T>
-static void unpack_QK(float * unpack_qk_dst, char * pack_qk_src, int seq_len, int kv_seq_len, int unit) {
+void CPUAttention::unpack_QK(float * unpack_qk_dst, char * pack_qk_src, int seq_len, int kv_seq_len) {
     float * dst = unpack_qk_dst;
-    T * src = (T *)(pack_qk_src);
+    T * src = (T *)(pack_qk_src);    
     // [kv_seq_len/unit, seq_len, unit] -> [seq_len, kv_seq_len]
     for (int i = 0; i < seq_len; i++) {
         for (int j = 0; j < kv_seq_len; j++) {
@@ -119,6 +148,11 @@ ErrorCode CPUAttention::onResize(const std::vector<Tensor*>& inputs, const std::
     mThreadNum = ((CPUBackend *)backend())->threadNumber();
     unit  = core->pack;
     bytes = core->bytes;
+    int qkvQuantOptions = static_cast<CPUBackend *>(backend())->getRuntime()->hint().qkvQuantOption;
+    mUseGemmInt8 = (qkvQuantOptions == 4);
+    if (mUseGemmInt8) {
+        static_cast<CPUBackend*>(backend())->int8Functions()->MNNGetGemmUnit(&hP8, &lP8, &eP8);
+    }
     auto query = inputs[0];
     auto key   = inputs[1];
     int seq_len = query->shape()[1];
@@ -126,12 +160,28 @@ ErrorCode CPUAttention::onResize(const std::vector<Tensor*>& inputs, const std::
     mHeadDim = query->shape()[3];
     mKvNumHead = key->shape()[2];
     mKVCacheManager->onResize(mKvNumHead, mHeadDim);
-    mPackQ.reset(Tensor::createDevice<float>({mThreadNum, UP_DIV(seq_len, eP), mHeadDim, eP}));
-    mPackQKV.reset(Tensor::createDevice<float>({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit}));
-    backend()->onAcquireBuffer(mPackQ.get(), Backend::DYNAMIC);
-    backend()->onAcquireBuffer(mPackQKV.get(), Backend::DYNAMIC);
-    backend()->onReleaseBuffer(mPackQ.get(), Backend::DYNAMIC);
-    backend()->onReleaseBuffer(mPackQKV.get(), Backend::DYNAMIC);
+    if (mUseGemmInt8) {
+        mPackQ.reset(Tensor::createDevice<int8_t>({mThreadNum, UP_DIV(seq_len, eP8), UP_DIV(mHeadDim, lP8), eP8 * lP8}));
+        mSumQ.reset(Tensor::createDevice<int32_t>({mThreadNum, UP_DIV(seq_len, eP8), eP8}));
+        mPackQKV.reset(Tensor::createDevice<float>({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit}));
+        backend()->onAcquireBuffer(mPackQ.get(), Backend::DYNAMIC);
+        backend()->onAcquireBuffer(mSumQ.get(), Backend::DYNAMIC);
+        backend()->onAcquireBuffer(mPackQKV.get(), Backend::DYNAMIC);
+        backend()->onReleaseBuffer(mPackQ.get(), Backend::DYNAMIC);
+        backend()->onReleaseBuffer(mSumQ.get(), Backend::DYNAMIC);
+        backend()->onReleaseBuffer(mPackQKV.get(), Backend::DYNAMIC);
+        mMinQ.resize(mNumHead);
+        mMaxQ.resize(mNumHead);
+        mQueryScale.resize(mNumHead);
+        mQueryZeroPoint.resize(mNumHead);
+    } else {
+        mPackQ.reset(Tensor::createDevice<float>({mThreadNum, UP_DIV(seq_len, eP), mHeadDim, eP}));
+        mPackQKV.reset(Tensor::createDevice<float>({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit}));
+        backend()->onAcquireBuffer(mPackQ.get(), Backend::DYNAMIC);
+        backend()->onAcquireBuffer(mPackQKV.get(), Backend::DYNAMIC);
+        backend()->onReleaseBuffer(mPackQ.get(), Backend::DYNAMIC);
+        backend()->onReleaseBuffer(mPackQKV.get(), Backend::DYNAMIC);    
+    }
     return NO_ERROR;
 }
 
@@ -179,12 +229,12 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
     // Temporary tensors for intermediate results
     std::shared_ptr<Tensor> packQK(Tensor::createDevice<float>({mThreadNum, UP_DIV(kv_seq_len, unit), seq_len, unit}));
     std::shared_ptr<Tensor> unpackQK(Tensor::createDevice<int32_t>({mThreadNum, seq_len, kv_seq_len}));
-    std::shared_ptr<Tensor> softmaxQK(Tensor::createDevice<int>({mThreadNum, seq_len, kv_seq_len}));
+    std::shared_ptr<Tensor> softmMaxQ(Tensor::createDevice<int32_t>({mThreadNum, seq_len, kv_seq_len}));
     std::shared_ptr<Tensor> newPackQK(Tensor::createDevice<float>({mThreadNum, UP_DIV(seq_len, eP), kv_seq_len, eP}));
     std::shared_ptr<Tensor> dequantV(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mHeadDim, hP), kv_seq_len, hP}));
     backend()->onAcquireBuffer(packQK.get(), Backend::STATIC);
     backend()->onAcquireBuffer(unpackQK.get(), Backend::STATIC);
-    backend()->onAcquireBuffer(softmaxQK.get(), Backend::STATIC);
+    backend()->onAcquireBuffer(softmMaxQ.get(), Backend::STATIC);
     backend()->onAcquireBuffer(newPackQK.get(), Backend::STATIC);
     if (quant_value) {
         backend()->onAcquireBuffer(dequantV.get(), Backend::STATIC);
@@ -194,48 +244,100 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
     std::function<void(int)> mCompute = [=](int tId) {
         auto pack_q      = mPackQ->host<char>() + tId * UP_DIV(seq_len, eP) * mHeadDim * eP * bytes;
         auto pack_qk     = packQK->host<char>() + tId * UP_DIV(kv_seq_len, unit) * seq_len * unit * bytes;
+        char * sum_q     = nullptr;
         auto unpack_qk   = unpackQK->host<float>() + tId * seq_len * kv_seq_len;
-        auto softmax_qk  = softmaxQK->host<float>() + tId * seq_len * kv_seq_len;
+        auto softmax_qk  = softmMaxQ->host<float>() + tId * seq_len * kv_seq_len;
         auto new_pack_qk = newPackQK->host<char>() + tId * UP_DIV(seq_len, eP) * kv_seq_len * eP * bytes;
         auto pack_qkv    = mPackQKV->host<char>() + tId * UP_DIV(mHeadDim, unit) * seq_len * unit * bytes;
         auto QxK         = quant_key ? core->MNNPackedMatMul_int8 : core->MNNPackedMatMul;
         auto QxK_remain  = quant_key ? core->MNNPackedMatMulRemain_int8 : core->MNNPackedMatMulRemain;
         int  head_index  = tId * tileCount;
+        if (mUseGemmInt8) {
+            pack_q  = mPackQ->host<char>() + tId * UP_DIV(seq_len, eP8) * UP_DIV(mHeadDim, lP8) * eP8 * lP8;
+            sum_q   = mSumQ->host<char>() + tId * UP_DIV(seq_len, eP8) * eP8 * 4;
+        }
         for (int h = head_index; h < head_index + tileCount && h < mNumHead; h++) {
             int    kv_h            = h / group_size;
             char * key_addr        = mKVCacheManager->addrOfKey(kv_h);
-            char * scale_addr      = quant_key ? mKVCacheManager->addrOfScale(kv_h) : nullptr;
-            char * zero_point_addr = quant_key ? mKVCacheManager->addrOfZeroPoint(kv_h) : nullptr;
-            char * value_addr      = quant_value ? dequantV->host<char>() + kv_h * UP_DIV(mHeadDim, hP) * kv_seq_len * hP * bytes : mKVCacheManager->addrOfValue(kv_h);
+            char * scale_addr      = mKVCacheManager->addrOfScale(kv_h);
+            char * zero_point_addr = mKVCacheManager->addrOfZeroPoint(kv_h);
+            char * key_sum_addr    = mKVCacheManager->addrOfKeySum(kv_h);
+            char * value_addr      = quant_value ? (dequantV->host<char>() + kv_h * UP_DIV(mHeadDim, hP) * kv_seq_len * hP * bytes) : mKVCacheManager->addrOfValue(kv_h);
             if (bytes == 2) {
-                pack_query<FLOAT16_T>(query, pack_q, mNumHead, mHeadDim, eP, seq_len, h, q_scale);
+                pack_query<FLOAT16_T>(query, pack_q, sum_q, seq_len, h, q_scale);
             } else {
-                pack_query<float>(query, pack_q, mNumHead, mHeadDim, eP, seq_len, h, q_scale);
+                pack_query<float>(query, pack_q, sum_q, seq_len, h, q_scale);
             }
             // query @ key
-            int loop_e = seq_len / eP;
-            int remain = seq_len % eP;
-            size_t shapeParameters[7] = {(size_t)eP * bytes, (size_t)mHeadDim, (size_t)kv_seq_len, (size_t)seq_len * unit * bytes, 0, 0, 0};
-            for (int i = 0 ; i < loop_e; i++) {
-                QxK((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mHeadDim * eP) * bytes), (float*)key_addr, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr);
+            if (mUseGemmInt8) {
+                auto GemmInt8Kernel = static_cast<CPUBackend*>(backend())->int8Functions()->Int8GemmKernel;
+                if (bytes == 2 && unit == 8) {
+                    GemmInt8Kernel = static_cast<CPUBackend*>(backend())->int8Functions()->MNNGemmInt8AddBiasScale_Unit_FP16;
+                }
+                std::vector<float> postScale(ROUND_UP(kv_seq_len, hP8), 0.0f);
+                for (int i = 0; i < kv_seq_len; i++) {
+                    postScale[i] = ((float*)scale_addr)[i] * mQueryScale[h] * q_scale;
+                }
+                std::vector<float> weightQuantBias(ROUND_UP(kv_seq_len, hP8), 0.0f);
+                for (int i = 0; i < kv_seq_len; i++) {
+                    weightQuantBias[i] = -((float*)scale_addr)[i] * ((float*)zero_point_addr)[i] * q_scale;
+                }
+                std::vector<float> biasFloat(ROUND_UP(kv_seq_len, hP8), 0.0f);
+                for (int i = 0; i < kv_seq_len; i++) {
+                    biasFloat[i] = -mQueryScale[h] * mQueryZeroPoint[h] * ((float*)key_sum_addr)[i] * q_scale;
+                }
+                QuanPostTreatParameters post;
+                post.bias = nullptr;
+                post.biasFloat = biasFloat.data();
+                post.blockNum = 1;
+                post.extraBias = nullptr;
+                post.extraScale = nullptr;
+                post.fp32minmax = nullptr;
+                post.scale = postScale.data();
+                post.useInt8 = false;
+                post.weightQuanBias = weightQuantBias.data();
+                int N = UP_DIV(seq_len, eP8);
+                for (int i = 0; i < N; i++) {
+                    int realcount = ALIMIN(eP8, seq_len - i * eP8);
+                    post.srcKernelSum = (float*)((char*)sum_q + i * eP8 * 4);
+                    GemmInt8Kernel(
+                        (int8_t*)pack_qk + i * eP8 * unit * bytes,
+                        (int8_t*)pack_q + i * ROUND_UP(mHeadDim, lP8) * eP8,
+                        (int8_t*)key_addr,
+                        UP_DIV(mHeadDim, lP8),
+                        seq_len * unit * bytes,
+                        UP_DIV(kv_seq_len, unit),
+                        &post,
+                        realcount
+                    );
+                }
+            }
+            else {
+                int loop_e = seq_len / eP;
+                int remain = seq_len % eP;
+                size_t shapeParameters[7] = {(size_t)eP * bytes, (size_t)mHeadDim, (size_t)kv_seq_len, (size_t)seq_len * unit * bytes, 0, 0, 0};
+                for (int i = 0 ; i < loop_e; i++) {
+                    QxK((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mHeadDim * eP) * bytes), (float*)key_addr, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr);
+                }
+                QxK_remain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mHeadDim * eP) * bytes), (float*)key_addr, remain, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr);
             }
-            QxK_remain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mHeadDim * eP) * bytes), (float*)key_addr, remain, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr);
             // qk: [kv_seq_len/unit, seq_len, unit] -> [seq_len, kv_seq_len] -> [seq_len/eP, kv_seq_len, eP]
             if(bytes == 2) {
-                unpack_QK<FLOAT16_T>(unpack_qk, pack_qk, seq_len, kv_seq_len, unit);
+                unpack_QK<FLOAT16_T>(unpack_qk, pack_qk, seq_len, kv_seq_len);
                 mask_QK<FLOAT16_T>(unpack_qk, seq_len, kv_seq_len, mScale, std::numeric_limits<float>::lowest(), mask->host<int>(), float_mask);
                 softmax_QK(softmax_qk, unpack_qk, seq_len, kv_seq_len);
                 pack_QK<FLOAT16_T>(new_pack_qk, softmax_qk, seq_len, kv_seq_len, eP);
             } else {
-                unpack_QK<float>(unpack_qk, pack_qk, seq_len, kv_seq_len, unit);
+                unpack_QK<float>(unpack_qk, pack_qk, seq_len, kv_seq_len);
                 mask_QK<float>(unpack_qk, seq_len, kv_seq_len, mScale, std::numeric_limits<float>::lowest(), mask->host<int>(), float_mask);
                 softmax_QK(softmax_qk, unpack_qk, seq_len, kv_seq_len);
                 pack_QK<float>(new_pack_qk, softmax_qk, seq_len, kv_seq_len, eP);
             }
             // qk @ v
-            shapeParameters[1] = kv_seq_len;
-            shapeParameters[2] = mHeadDim;
+            size_t shapeParameters[7] = {(size_t)eP * bytes, (size_t)kv_seq_len, (size_t)mHeadDim, (size_t)seq_len * unit * bytes, 0, 0, 0};
             shapeParameters[5] = quant_value ? 0 : (max_len - kv_seq_len) * hP * bytes;
+            int loop_e = seq_len / eP;
+            int remain = seq_len % eP;
             for (int i = 0 ; i < loop_e; i++) {
                 core->MNNPackedMatMul((float*)(pack_qkv + (i * eP * unit) * bytes), (float*)(new_pack_qk + (i * kv_seq_len * eP) * bytes), (float*)value_addr, shapeParameters, nullptr, nullptr, nullptr, nullptr);
             }
@@ -257,7 +359,7 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
 
     backend()->onReleaseBuffer(packQK.get(), Backend::STATIC);
     backend()->onReleaseBuffer(unpackQK.get(), Backend::STATIC);
-    backend()->onReleaseBuffer(softmaxQK.get(), Backend::STATIC);
+    backend()->onReleaseBuffer(softmMaxQ.get(), Backend::STATIC);
     backend()->onReleaseBuffer(newPackQK.get(), Backend::STATIC);
     if (quant_value){
         backend()->onReleaseBuffer(dequantV.get(), Backend::STATIC);
@@ -277,10 +379,13 @@ bool CPUAttention::onClone(Backend* bn, const Op* op, Execution** dst) {
 
 CPUAttention::CPUAttention(Backend *backend, bool kv_cache) : Execution(backend), mKVCache(kv_cache) {
     if (mKVCache) {
+        mPackQ.reset(Tensor::createDevice<float>({1, 1, 1, 1}));
+        mPackQKV.reset(Tensor::createDevice<float>({1, 1, 1, 1}));
         MNN::KVCacheManager::KVCacheConfig kvconfig;
-        int kvcacheQuantOptions = static_cast<CPUBackend *>(backend)->getRuntime()->hint().kvcacheQuantOption;
-        kvconfig.mQuantKey   = (kvcacheQuantOptions & 1);
-        kvconfig.mQuantValue = ((kvcacheQuantOptions >> 1) & 1);
+        int qkvQuantOptions = static_cast<CPUBackend *>(backend)->getRuntime()->hint().qkvQuantOption;
+        kvconfig.mUseInt8Kernel = (qkvQuantOptions == 4);
+        kvconfig.mQuantKey   = (qkvQuantOptions == 4) || (qkvQuantOptions & 1);
+        kvconfig.mQuantValue = (qkvQuantOptions == 4) || ((qkvQuantOptions >> 1) & 1);
         kvconfig.mKVCacheDir = static_cast<CPUBackend *>(backend)->getRuntime()->hint().kvcacheDirPath;
         kvconfig.mKVCacheSizeLimit = static_cast<CPUBackend *>(backend)->getRuntime()->hint().kvcacheSizeLimit;
         kvconfig.mExpandChunk = 64;
@@ -305,4 +410,4 @@ REGISTER_CPU_OP_CREATOR_TRANSFORMER(CPUAttentionCreator, OpType_Attention);
 
 } // namespace MNN
 
-#endif // MNN_SUPPORT_TRANSFORMER_FUSE
+#endif // MNN_SUPPORT_TRANSFORMER_FUSE
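
With qkvQuantOption set to 4, CPUAttention now routes Q*K^T through the int8 GEMM: each query head is quantized asymmetrically from its own min/max before the matmul, and per-row sums of the quantized values feed the zero-point correction terms assembled in onExecute (postScale, weightQuantBias, srcKernelSum). A minimal sketch of the scale/zero-point math used in pack_query, with illustrative names:

#include <cmath>
#include <cstdint>

// Map one head's value range [minQ, maxQ] onto [-128, 127]; matches the formulas in pack_query.
// Assumes maxQ > minQ, as the kernel does.
inline int8_t quantizeQuery(float x, float minQ, float maxQ) {
    float scale = (maxQ - minQ) / 255.0f;
    float zero  = -255.0f * minQ / (maxQ - minQ) - 128.0f;  // minQ -> -128, maxQ -> 127
    return (int8_t)std::roundf(x / scale + zero);
}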

+ 7 - 2
source/backend/cpu/CPUAttention.hpp

@@ -29,12 +29,17 @@ private:
     bool mIsPrefill      = true;
     bool mIsFirstPrefill = true;
     bool mKVCache        = true;
+    bool mUseGemmInt8    = false;
     int bytes = 4;
     int mThreadNum = 1;;
-    int eP, lP, hP, unit;
+    int eP, lP, hP, unit; // float matmul packing
+    int eP8, lP8, hP8;    // GemmInt8 packing
     int mNumHead, mKvNumHead, mHeadDim;
-    std::shared_ptr<Tensor> mPackQ, mPackQKV;
+    std::shared_ptr<Tensor> mPackQ, mPackQKV, mSumQ;
     std::shared_ptr<KVCacheManager> mKVCacheManager = nullptr;
+    std::vector<float> mMinQ, mMaxQ, mQueryScale, mQueryZeroPoint;
+    template <typename T> void pack_query(Tensor* query, char* pack_q, char* sum_q, int seq_len, int h, float q_scale);
+    template <typename T> void unpack_QK(float * unpack_qk_dst, char * pack_qk_src, int seq_len, int kv_seq_len);
 };
 
 } // namespace MNN

+ 112 - 43
source/backend/cpu/CPUBackend.cpp

@@ -37,6 +37,7 @@
 #include "x86_x64/AVX2Backend.hpp"
 #endif
 
+#define MNN_CPU_MAX_BUFFER_INDEX 2
 #define MNN_CPU_CHECK_NAN 1
 #define MNN_CPU_USE_DEFAULT_BACKEND 4
 namespace MNN {
@@ -208,7 +209,12 @@ void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool ful
 }
 
 CPURuntime::CPURuntime(const Backend::Info& info) {
-    mStaticAllocator.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createDefault()));
+    auto rawAlloc = BufferAllocator::Allocator::createDefault();
+    mStaticAllocator.reset(new EagerBufferAllocator(rawAlloc));
+    mDynamic.resize(MNN_CPU_MAX_BUFFER_INDEX);
+    for (auto& buf : mDynamic) {
+        buf.root = rawAlloc;
+    }
     mThreadNumber = info.numThread;
     mPower   = BackendConfig::Power_Normal;
     mMemory  = BackendConfig::Memory_Normal;
@@ -231,17 +237,49 @@ CPURuntime:: ~ CPURuntime() {
 }
 float CPURuntime::onGetMemoryInMB() {
     auto staticMemoryInMB = mStaticAllocator->totalSize() / 1024.0f / 1024.0f;
-    return staticMemoryInMB;
+    float dynamicMemoryInMB = 0.0f;
+    for (auto& buf : mDynamic) {
+        dynamicMemoryInMB += buf.currentSize / 1024.0f / 1024.0f;
+    }
+    return staticMemoryInMB + dynamicMemoryInMB;
 }
 bool CPURuntime::onCheckInfo(Backend::Info& info) const {
     info.numThread = mThreadNumber;
     return true;
 }
+SingleBufferWithAllocator* CPURuntime::buffer(int index) const {
+    if (mDynamicMmap.empty()) {
+        return mDynamic.data() + index;
+    }
+    return mDynamicMmap.data() + index;
+}
 
-Backend* CPURuntime::onCreate(const BackendConfig* config) const {
+Backend* CPURuntime::onCreate(const BackendConfig* config, Backend* origin) const {
+    if (hint().midMemoryPath.size() > 0) {
+        if (mDynamicMmap.empty()) {
+            // Only support set featuremap dir once
+            mDynamicMmap.resize(2);
+            auto mmapMem = BufferAllocator::Allocator::createMmap(hint().midMemoryPath.c_str(), "dynamic");
+            for (auto& buf : mDynamicMmap) {
+                buf.root = mmapMem;
+            }
+        }
+    }
+    if (hint().weightMemoryPath.size() > 0) {
+        if (nullptr == mStaticAllocatorCache.get()) {
+            // Only support set weightmap dir once
+            mStaticAllocatorCache = mStaticAllocator;
+            auto mmapMem = BufferAllocator::Allocator::createMmap(hint().weightMemoryPath.c_str(), "static");
+            mStaticAllocator.reset(new EagerBufferAllocator(mmapMem, 32, 1024 * 1024 * 1024));
+        }
+    }
     auto precision = mPrecision;
     auto memory = mMemory;
     size_t flags = mFlags;
+    if (nullptr != origin) {
+        auto cpuBn = static_cast<CPUBackend*>(origin);
+        mSharedDmaInfo = cpuBn->mDmaInfo;
+    }
     _resetGroupCompute();
     if (nullptr != config) {
         precision = config->precision;
@@ -251,30 +289,36 @@ Backend* CPURuntime::onCreate(const BackendConfig* config) const {
 #ifdef LOG_VERBOSE
     MNN_PRINT("cpu backend was created by runtime:%p\n", this);
 #endif
-
+    CPUBackend* res = nullptr;
+    do {
 #ifdef MNN_USE_ARMV82
-    auto core = MNNGetCoreFunctions();
-    if (core->supportFp16arith && precision == BackendConfig::Precision_Low) {
-        return new Arm82Backend(this, memory);
-    }
+        auto core = MNNGetCoreFunctions();
+        if (core->supportFp16arith && precision == BackendConfig::Precision_Low) {
+            res = new Arm82Backend(this, memory);
+            break;
+        }
 #endif
 #ifdef MNN_SUPPORT_BF16
-    if (precision == BackendConfig::Precision_Low_BF16 && BF16Functions::get()) {
-        auto res = new CPUBackend(this, precision, memory, MNN_FORWARD_CPU_EXTENSION, 0);
-        res->mCoreFunctions = BF16Functions::get();
-        return res;
-    }
+        if (precision == BackendConfig::Precision_Low_BF16 && BF16Functions::get()) {
+            res = new CPUBackend(this, precision, memory, MNN_FORWARD_CPU_EXTENSION, 0);
+            res->mCoreFunctions = BF16Functions::get();
+            break;
+        }
 #endif
-    if (flags == MNN_CPU_USE_DEFAULT_BACKEND) {
-        return new CPUBackend(this, precision, memory, MNN_FORWARD_CPU, 0);
-    }
+        if (flags == MNN_CPU_USE_DEFAULT_BACKEND) {
+            res = new CPUBackend(this, precision, memory, MNN_FORWARD_CPU, 0);
+            break;
+        }
 #ifdef MNN_USE_SSE
-    if (AVX2Backend::isValid()) {
-        return new AVX2Backend(this, memory, flags);
-    }
+        if (AVX2Backend::isValid()) {
+            res = new AVX2Backend(this, memory, flags);
+            break;
+        }
 #endif
-
-    return new CPUBackend(this, precision, memory, MNN_FORWARD_CPU, flags);
+        res = new CPUBackend(this, precision, memory, MNN_FORWARD_CPU, flags);
+    } while (false);
+    mSharedDmaInfo = nullptr;
+    return res;
 }
 
 int CPURuntime::onGetRuntimeStatus(RuntimeStatus statusEnum) const {
@@ -298,6 +342,11 @@ int CPURuntime::onGetRuntimeStatus(RuntimeStatus statusEnum) const {
 
 void CPURuntime::onGabageCollect(int level) {
     mStaticAllocator->release(false);
+    if (level >= 100) {
+        for (auto& buf : mDynamic) {
+            buf.release();
+        }
+    }
 }
 
 
@@ -339,25 +388,34 @@ bool CPUBackend::addCreator(OpType t, Creator* c) {
     map->insert(std::make_pair(t, c));
     return true;
 }
-
+BufferAllocator* CPURuntime::createDynamicBufferAlloctor(int index) const {
+    if (hint().memoryAllocatorType == Runtime::Allocator_Defer) {
+        return new DeferBufferAllocator(buffer(index));
+    }
+    if (nullptr != mStaticAllocatorCache.get()) {
+        return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocatorCache.get()));
+    }
+    return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocator.get()));
+}
 CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type, size_t flags) : Backend(type) {
 #ifdef LOG_VERBOSE
     MNN_PRINT("cpu backend create\n");
 #endif
     mMemory = memory;
     mRuntime = const_cast<CPURuntime*>(runtime);
-    std::shared_ptr<BufferAllocator::Allocator> defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get()));
-    if (mRuntime->hint().memoryAllocatorType == Runtime::Allocator_Defer) {
-        mDynamicAllocator.reset(new DeferBufferAllocator(defaultAlloc));
+    auto dynamicAlloc = mRuntime->mSharedDmaInfo;
+    if (nullptr == dynamicAlloc.get()) {
+        mDmaInfo.reset(new CPURuntime::DynamicAllocator);
+        mDmaInfo->mDynamicAllocator.reset(mRuntime->createDynamicBufferAlloctor(0));
+        mDmaInfo->mCurrentDynamicAllocator = mDmaInfo->mDynamicAllocator.get();
     } else {
-        mDynamicAllocator.reset(new EagerBufferAllocator(defaultAlloc));
+        mDmaInfo = dynamicAlloc;
     }
-    mCurrentDynamicAllocator = mDynamicAllocator.get();
     mStaticAllocator = runtime->mStaticAllocator;
     mPrecisionMode = precision;
     mCoreFunctions = MNNGetCoreFunctions();
     mInt8CoreFunctions = MNNGetInt8CoreFunctions();
-    mCacheGroup.resize(2);
+    mCacheGroup.resize(MNN_CPU_MAX_BUFFER_INDEX);
     for (int i=0; i<mCacheGroup.size(); ++i) {
         mCacheGroup[i].reset(new CPUResizeCache);
     }
@@ -367,8 +425,15 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
 CPUBackend::~CPUBackend() {
     mCacheGroup.clear();
 }
+void CPUBackend::_resetDynamicMemory() const {
+    mDmaInfo->mDynamicAllocator->apply();
+    if (nullptr != mDmaInfo->mDynamicAllocatorBackup.get()) {
+        mDmaInfo->mDynamicAllocatorBackup->apply();
+    }
+}
 
 void CPUBackend::onExecuteBegin() const {
+    _resetDynamicMemory();
     mRuntime->onConcurrencyBegin();
 }
 
@@ -377,23 +442,20 @@ void CPUBackend::onExecuteEnd() const {
 }
 
 void CPUBackend::onResizeBegin() {
-    mCurrentDynamicAllocator->reset();
+    mDmaInfo->mCurrentDynamicAllocator->reset();
 }
 bool CPUBackend::onSelectDynamicAllocator(int index, int maxIndex) {
     if (maxIndex > 2) {
         return false;
     }
-    if (maxIndex == 2 && mDynamicAllocatorBackup.get() == nullptr) {
-        if (mRuntime->hint().memoryAllocatorType == Runtime::Allocator_Defer) {
-            mDynamicAllocatorBackup.reset(new DeferBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocator.get())));
-        } else {
-            mDynamicAllocatorBackup.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocator.get())));
-        }
+    if (maxIndex == 2 && mDmaInfo->mDynamicAllocatorBackup.get() == nullptr) {
+        mDmaInfo->mDynamicAllocatorBackup.reset(mRuntime->createDynamicBufferAlloctor(1));
     }
     if (1 == index) {
-        mCurrentDynamicAllocator = mDynamicAllocatorBackup.get();
+        mDmaInfo->mCurrentDynamicAllocator = mDmaInfo->mDynamicAllocatorBackup.get();
     } else {
-        mCurrentDynamicAllocator = mDynamicAllocator.get();
+        mRuntime->buffer(0)->release();
+        mDmaInfo->mCurrentDynamicAllocator = mDmaInfo->mDynamicAllocator.get();
     }
     mCache = mCacheGroup[index].get();
     return true;
@@ -401,7 +463,11 @@ bool CPUBackend::onSelectDynamicAllocator(int index, int maxIndex) {
 
 ErrorCode CPUBackend::onResizeEnd() {
     getCache()->release();
-    return mCurrentDynamicAllocator->compute();
+    auto code = mDmaInfo->mCurrentDynamicAllocator->compute();
+    if (NO_ERROR != code) {
+        return code;
+    }
+    return NO_ERROR;
 }
 
 Backend::MemObj* CPUBackend::allocBuffer(size_t size, Tensor* dest, StorageType storageType) {
@@ -431,11 +497,11 @@ Backend::MemObj* CPUBackend::allocBuffer(size_t size, Tensor* dest, StorageType
             break;
         }
         case DYNAMIC: {
-            chunk = mCurrentDynamicAllocator->alloc(size, false);
+            chunk = mDmaInfo->mCurrentDynamicAllocator->alloc(size, false);
             break;
         }
         case DYNAMIC_SEPERATE: {
-            chunk = mCurrentDynamicAllocator->alloc(size, true);
+            chunk = mDmaInfo->mCurrentDynamicAllocator->alloc(size, true);
             break;
         }
         default:
@@ -453,7 +519,7 @@ Backend::MemObj* CPUBackend::allocBuffer(size_t size, Tensor* dest, StorageType
     if (storageType == STATIC) {
         res = new CPUMemObj(mStaticAllocator.get(), chunk, size);
     } else {
-        res = new CPUMemObj(mCurrentDynamicAllocator, chunk, size);
+        res = new CPUMemObj(mDmaInfo->mCurrentDynamicAllocator, chunk, size);
         chunk.attach(dest);
     }
     if (chunk.ptr()) {
@@ -591,8 +657,11 @@ const Runtime* CPUBackend::getRuntime() {
 }
 
 bool CPUBackend::onClearBuffer() {
+    if (nullptr != mRuntime->mStaticAllocatorCache.get()) {
+        mStaticAllocator = mRuntime->mStaticAllocatorCache;
+    }
     mCache->reset();
-    mCurrentDynamicAllocator->release(true);
+    mDmaInfo->mCurrentDynamicAllocator->release(true);
     return true;
 }
 
@@ -606,9 +675,9 @@ std::pair<int, int> CPUBackend::multiThreadDivide(int size) const {
     return std::make_pair(sizeDivide, scheduleNumber);
 }
 void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
+    _resetDynamicMemory();
     auto& srcBuffer = srcTensor->buffer();
     auto& dstBuffer = dstTensor->buffer();
-
     if (srcBuffer.dimensions != dstBuffer.dimensions ) {
         if (srcBuffer.dim[srcBuffer.dimensions - 1].extent != 1 && dstBuffer.dim[dstBuffer.dimensions - 1].extent != 1) {
             MNN_ERROR("srcBuffer dimension not equal to dstBuffer, can't copy buffer\n");

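The changes above route all dynamic tensor memory through per-runtime SingleBufferWithAllocator slots and, when the Allocator_Defer hint is set, through a DeferBufferAllocator whose plan is only materialized by _resetDynamicMemory() (which calls apply()) right before execution or buffer copies. The snippet below is a minimal, self-contained sketch of that defer-then-apply idea; DeferArena and its methods are hypothetical names used for illustration only and are not the MNN API.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical illustration: record allocation requests during the "resize" pass,
// materialize a single arena later, then hand out the recorded offsets ("apply").
class DeferArena {
public:
    // During resize: remember the size, return a handle.
    size_t request(size_t size) {
        mOffsets.push_back(mTotal);
        mTotal += (size + 63) & ~size_t(63); // keep 64-byte alignment
        return mOffsets.size() - 1;
    }
    // After resize: allocate the whole arena at once.
    void apply() {
        mStorage.resize(mTotal);
    }
    // During execution: translate a handle into a real pointer.
    void* ptr(size_t handle) {
        return mStorage.data() + mOffsets[handle];
    }
private:
    std::vector<uint8_t> mStorage;
    std::vector<size_t> mOffsets;
    size_t mTotal = 0;
};

int main() {
    DeferArena arena;
    size_t a = arena.request(1024);   // requests gathered during resize
    size_t b = arena.request(4096);
    arena.apply();                    // analogous to _resetDynamicMemory()
    std::printf("a=%p b=%p\n", arena.ptr(a), arena.ptr(b));
    return 0;
}

Collecting every request before committing memory is what lets a single backing buffer (heap memory, or an mmapped file when midMemoryPath is set) serve all dynamic tensors of a resize plan.
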
+ 17 - 7
source/backend/cpu/CPUBackend.hpp

@@ -20,11 +20,16 @@
 namespace MNN {
 class CPURuntime : public Runtime {
 public:
+    struct DynamicAllocator {
+        std::shared_ptr<BufferAllocator> mDynamicAllocator;
+        std::shared_ptr<BufferAllocator> mDynamicAllocatorBackup;
+        BufferAllocator* mCurrentDynamicAllocator = nullptr;
+    };
     friend class CPUBackend;
     CPURuntime(const Backend::Info& info);
     virtual ~ CPURuntime();
     int onGetRuntimeStatus(RuntimeStatus statusEnum) const override;
-    virtual Backend* onCreate(const BackendConfig* config) const override;
+    virtual Backend* onCreate(const BackendConfig* config, Backend* origin) const override;
     virtual void onReset(int numberThread, const BackendConfig* config, bool full) override;
     virtual void onGabageCollect(int level) override;
     virtual float onGetMemoryInMB() override;
@@ -43,10 +48,13 @@ public:
         return mThreadOpen;
     }
 #endif
+    SingleBufferWithAllocator* buffer(int index) const;
+    BufferAllocator* createDynamicBufferAlloctor(int index) const;
+
 private:
     void _bindCPUCore() const;
     void _resetThreadPool();
-    std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
+    mutable std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
     int mThreadNumber;
 #ifdef MNN_USE_THREAD_POOL
     mutable int mTaskIndex = -1;
@@ -64,6 +72,10 @@ private:
     static Backend*(*gExtraCreate)(const Runtime* runtime);
     size_t mFlags = 0;
     mutable int mCurrentTID = 0;
+    mutable std::vector<SingleBufferWithAllocator> mDynamic;
+    mutable std::vector<SingleBufferWithAllocator> mDynamicMmap;
+    mutable std::shared_ptr<DynamicAllocator> mSharedDmaInfo;
+    mutable std::shared_ptr<EagerBufferAllocator> mStaticAllocatorCache;
 };
 struct CoreFunctions;
 struct CoreInt8Functions;
@@ -122,6 +134,7 @@ public:
     const CoreInt8Functions* int8Functions() const {
         return mInt8CoreFunctions;
     }
+    void _resetDynamicMemory() const;
 public:
     class Creator {
     public:
@@ -141,7 +154,7 @@ public:
 #endif
 
     BufferAllocator* getBufferAllocator(bool defer_allocator = true) const {
-        return mCurrentDynamicAllocator;
+        return mDmaInfo->mCurrentDynamicAllocator;
     }
 
     BackendConfig::MemoryMode memoryMode() const {
@@ -164,22 +177,19 @@ public:
     static DataType getDataType(const Tensor* tensor);
     friend class CPURuntime;
 
-
 protected:
     MemObj* allocBuffer(size_t size, Tensor* dest,  StorageType storageType);
     CoreFunctions* mCoreFunctions;
     CoreInt8Functions* mInt8CoreFunctions;
 private:
+    std::shared_ptr<CPURuntime::DynamicAllocator> mDmaInfo;
     std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
-    std::shared_ptr<BufferAllocator> mDynamicAllocator;
-    std::shared_ptr<BufferAllocator> mDynamicAllocatorBackup;
     CPURuntime* mRuntime;
     BackendConfig::PrecisionMode mPrecisionMode;
     BackendConfig::MemoryMode mMemory;
     static std::map<OpType, CPUBackend::Creator*>* gCreator;
     CPUResizeCache* mCache;
     std::vector<std::shared_ptr<CPUResizeCache>> mCacheGroup;
-    BufferAllocator* mCurrentDynamicAllocator = nullptr;
 };
 /** execution cast wrapper. insert tensor cast dynamic. */
 class CastWrapExecution : public Execution {

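The DynamicAllocator struct bundles the primary allocator, the lazily created backup allocator, and a pointer to whichever one is currently selected, so onCreate(config, origin) can hand an existing backend's bundle to a newly created backend instead of rebuilding the pools. The toy sketch below only illustrates that shared-ownership idea with std::shared_ptr; the type and field names are hypothetical.

#include <cstdio>
#include <memory>
#include <string>

// Hypothetical stand-in for the allocator bundle shared between backends.
struct DynamicInfo {
    std::string current = "primary";
};

struct ToyBackend {
    std::shared_ptr<DynamicInfo> dma;
    explicit ToyBackend(std::shared_ptr<DynamicInfo> shared = nullptr) {
        // Reuse the origin backend's bundle when one is given, otherwise make a new one.
        dma = shared ? shared : std::make_shared<DynamicInfo>();
    }
};

int main() {
    ToyBackend origin;                // builds its own DynamicInfo
    ToyBackend clone(origin.dma);     // shares the same DynamicInfo
    clone.dma->current = "backup";
    std::printf("origin sees: %s\n", origin.dma->current.c_str()); // prints "backup"
    return 0;
}
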
+ 2 - 3
source/backend/cpu/CPUCast.cpp

@@ -21,13 +21,12 @@ ErrorCode CPUCastCreator::cast(const void* inputRaw, void* outputRaw, ConvertTyp
     int remain = number % pack;
     if (type == FlOAT_TO_INT8) {
         scale = (scale == 0.f ? 0.f : 1.f / scale);
-        std::vector<float> scales(pack, scale);
-        bn->int8Functions()->MNNFloat2Int8((float*)(inputRaw), (int8_t*)(outputRaw), c4Size, scales.data(), min, max, zero);
+        bn->int8Functions()->MNNFloat2Int8((float*)(inputRaw), (int8_t*)(outputRaw), c4Size, &scale, min, max, &zero, 0);
         if (remain > 0) {
             std::vector<float> tempSrc(pack);
             std::vector<int8_t> tempDst(pack);
             ::memcpy(tempSrc.data(), (float*)(inputRaw) + c4Size * pack, remain * sizeof(float));
-            bn->int8Functions()->MNNFloat2Int8(tempSrc.data(), tempDst.data(), 1, scales.data(), min, max, zero);
+            bn->int8Functions()->MNNFloat2Int8(tempSrc.data(), tempDst.data(), 1, &scale, min, max, &zero, 0);
             ::memcpy(static_cast<int8_t*>(outputRaw) + c4Size * pack, tempDst.data(), remain * sizeof(int8_t));
         }
         return NO_ERROR;

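The updated MNNFloat2Int8 call sites pass a single scale and zero point by pointer, with the trailing argument selecting scalar versus per-channel parameters, rather than building a pack-sized scale vector. For reference, here is a small self-contained sketch of the scalar conversion the caller expects, q = clamp(round(x * scale) + zero, min, max); it illustrates the math only and is not the MNN kernel.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar float -> int8 quantization with clamping, mirroring the parameters
// the call sites above pass (one scale, one zero point, clamp range).
static void floatToInt8(const float* src, int8_t* dst, size_t count,
                        float scale, float zero, int8_t minV, int8_t maxV) {
    for (size_t i = 0; i < count; ++i) {
        float q = std::roundf(src[i] * scale) + zero;
        q = std::min(std::max(q, (float)minV), (float)maxV);
        dst[i] = (int8_t)q;
    }
}

int main() {
    std::vector<float> src = {0.0f, 0.5f, -0.5f, 1.27f, -3.0f};
    std::vector<int8_t> dst(src.size());
    // Assume a tensor scale of 0.01, so the kernel receives 1/0.01 = 100 and zero point 0.
    floatToInt8(src.data(), dst.data(), src.size(), 100.0f, 0.0f, -127, 127);
    for (auto v : dst) std::printf("%d ", (int)v);   // 0 50 -50 127 -127
    std::printf("\n");
    return 0;
}
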
+ 0 - 170
source/backend/cpu/CPUConvolution.cpp

@@ -117,7 +117,6 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector<flo
         return;
     }
 
-    int size = mResource->mOutputCount;
     const int kernelNum = static_cast<int>(mResource->mInt8WeightKernelSum.size());
     auto biasData    = mResource->mOriginBias->host<float>();
     auto alphaData   = mResource->mOriginScale->host<float>();
@@ -189,7 +188,6 @@ std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(B
     const int8_t* weightSrc = nullptr;
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    resource->mOutputCount = outputCount;
     if (!ConvolutionCommon::getConvInt8Parameters(op, quanCommon, backend, weightSrc, weightSize, scalePtr, biasPtr, betaPtr)) {
         return nullptr;
     }
@@ -254,174 +252,6 @@ std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(B
     return resource;
 }
 
-void CPUConvolution::makeResource(Backend* backend, std::shared_ptr<Resource> resource, const MNN::Op *op, std::shared_ptr<ResourceInt8> resourceInt8) {
-    /* Used to compute weight quant scale and bias and weightKernelSum of type float. */
-    auto conv2d = op->main_as_Convolution2D();
-    bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr);
-    MNN_ASSERT(quanBuffer || resourceInt8);
-    resource->backend = backend;
-    auto core = static_cast<CPUBackend*>(backend)->functions();
-    // common parameters
-    int outputCount = conv2d->common()->outputCount();
-    int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY();
-    int ocUp4 = ROUND_UP(outputCount, core->pack);
-    int8_t* weightOrigin;
-
-    // Save weight quant scale and bias: wf=scale*wi+bias
-    resource->mDequantize.mScaleBias.reset(Tensor::createDevice<uint8_t>({2 * ocUp4 * core->bytes}));
-    auto success = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC);
-    if (!success) {
-        MNN_ERROR("Alloc denquant scaleBias memory error\n");
-        return;
-    }
-    auto alphaPtr = resource->mDequantize.mScaleBias->host<float>();
-    auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(alphaPtr) + ocUp4 * core->bytes);
-    ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes);
-
-    std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon;
-    // Load quant scale and bias
-    if (quanBuffer) {
-        quantCommon = ConvolutionCommon::load(op, backend, false, true);
-        weightOrigin = quantCommon->weight.get(); // weight before reorder
-
-        int h = quantCommon->alpha.size();
-        if (core->bytes == 2) {
-            if (quantCommon->asymmetric) {
-                std::unique_ptr<int16_t[]> tmp(new int16_t[h]);
-                core->MNNFp32ToLowp(quantCommon->alpha.get(), tmp.get(), h);
-                for (int i=0; i< h/2; ++i) {
-                    reinterpret_cast<int16_t*>(alphaPtr)[i] = tmp[2 * i + 1];
-                    reinterpret_cast<int16_t*>(biasPtr)[i] = tmp[2 * i];
-                }
-            } else {
-                core->MNNFp32ToLowp(quantCommon->alpha.get(), reinterpret_cast<int16_t*>(alphaPtr), h);
-            }
-        } else {
-            if (quantCommon->asymmetric) {
-                h = h / 2;
-                for (int i=0; i<h; ++i) {
-                    alphaPtr[i] = quantCommon->alpha.get()[2 * i + 1];
-                    biasPtr[i] = quantCommon->alpha.get()[2 * i];
-                }
-            } else {
-                for (int i=0; i<h; ++i) {
-                    alphaPtr[i] = quantCommon->alpha.get()[i];
-                    biasPtr[i] = 0.f;
-                }
-            }
-        }
-    } else {
-        weightOrigin = resourceInt8->mWeightInt8->host<int8_t>();
-        auto wZero = resourceInt8->mWeightQuantZero->host<int32_t>(); // has packed to outputUp4
-        auto wScale = resourceInt8->mOriginScale->host<float>();
-        int h = ocUp4;
-        if (core->bytes == 2) {
-            std::unique_ptr<int16_t[]> tmp(new int16_t[h]);
-            core->MNNFp32ToLowp(wScale, tmp.get(), h);
-            for (int i=0; i< h; ++i) {
-                reinterpret_cast<int16_t*>(alphaPtr)[i] = tmp[i];
-                reinterpret_cast<int16_t*>(biasPtr)[i] = (-1.f) * wZero[i] * tmp[i];
-            }
-        } else {
-            for (int i=0; i< h; ++i) {
-                alphaPtr[i] = wScale[i];
-                biasPtr[i] = (-1.f) * wZero[i] * wScale[i];
-            }
-        }
-    }
-
-    // Compute float weightKernelSum
-    resource->mWeightKernelSum.reset(Tensor::createDevice<uint8_t>({ocUp4 * 4}));
-    success = resource->backend->onAcquireBuffer(resource->mWeightKernelSum.get(), Backend::STATIC);
-    if (!success) {
-        MNN_ERROR("Alloc denquant mWeightKernelSum memory error\n");
-        return;
-    }
-    auto weightKernelSum = resource->mWeightKernelSum->host<float>();
-    for (int i = 0; i < outputCount; ++i) {
-        int sum = 0;
-        for (int j = 0; j < LSize; ++j) {
-            sum = sum + static_cast<int>(weightOrigin[j + i * LSize]);
-        }
-        if(core->bytes == 2) {
-            auto scale = reinterpret_cast<int16_t*>(alphaPtr)[i];
-            auto bias = reinterpret_cast<int16_t*>(biasPtr)[i];
-            weightKernelSum[i] = static_cast<float>(sum) * scale + LSize * bias;
-        } else {
-            auto scale = alphaPtr[i];
-            auto bias = biasPtr[i];
-            weightKernelSum[i] = static_cast<float>(sum) * scale + LSize * bias;
-        }
-    }
-}
-
-void CPUConvolution::makeResourceNew(Backend* backend, const Convolution2D* conv2d, std::shared_ptr<ResourceInt8> resourceInt8) {
-    /* Used to compute weight quant scale and bias and weightKernelSum of type float. */
-    bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr);
-    MNN_ASSERT(quanBuffer || resourceInt8);
-    auto core = static_cast<CPUBackend*>(backend)->functions();
-    // common parameters
-    int outputCount = conv2d->common()->outputCount();
-    int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY();
-    int ocUp4 = ROUND_UP(outputCount, core->pack);
-    int8_t* weightOrigin;
-
-    // Save weight quant scale and bias: wf=scale*wi+bias
-    std::shared_ptr<Tensor> scaleBias(Tensor::createDevice<uint8_t>({2 * ocUp4 * core->bytes}));
-    auto success = backend->onAcquireBuffer(scaleBias.get(), Backend::STATIC);
-    if (!success) {
-        MNN_ERROR("Alloc dequant scaleBias memory error\n");
-        return;
-    }
-    auto alphaPtr = scaleBias->host<float>();
-    auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(alphaPtr) + ocUp4 * core->bytes);
-    ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes);
-    
-    // Load quant scale and bias
-    weightOrigin = resourceInt8->mWeightInt8->host<int8_t>();
-    auto wZero = resourceInt8->mWeightQuantZero->host<int32_t>(); // has packed to outputUp4
-    auto wScale = resourceInt8->mOriginScale->host<float>();
-    int h = ocUp4;
-    if (core->bytes == 2) {
-        std::unique_ptr<int16_t[]> tmp(new int16_t[h]);
-        core->MNNFp32ToLowp(wScale, tmp.get(), h);
-        for (int i=0; i< h; ++i) {
-            reinterpret_cast<int16_t*>(alphaPtr)[i] = tmp[i];
-            reinterpret_cast<int16_t*>(biasPtr)[i] = (-1.f) * wZero[i] * tmp[i];
-        }
-    } else {
-        for (int i=0; i< h; ++i) {
-            alphaPtr[i] = wScale[i];
-            biasPtr[i] = (-1.f) * wZero[i] * wScale[i];
-        }
-    }
-    resourceInt8->mOriginScale = scaleBias;
-    
-    // Compute float weightKernelSum
-    resourceInt8->mWeightKernelSum.reset(Tensor::createDevice<uint8_t>({ocUp4 * 4}));
-    success = backend->onAcquireBuffer(resourceInt8->mWeightKernelSum.get(), Backend::STATIC);
-    if (!success) {
-        MNN_ERROR("Alloc dequant mWeightKernelSum memory error\n");
-        return;
-    }
-    auto weightKernelSum = resourceInt8->mWeightKernelSum->host<float>();
-    for (int i = 0; i < outputCount; ++i) {
-        int sum = 0;
-        for (int j = 0; j < LSize; ++j) {
-            sum = sum + static_cast<int>(weightOrigin[j + i * LSize]);
-        }
-        if(core->bytes == 2) {
-            auto scale = reinterpret_cast<int16_t*>(alphaPtr)[i];
-            auto bias = reinterpret_cast<int16_t*>(biasPtr)[i];
-            weightKernelSum[i] = static_cast<float>(sum) * scale + LSize * bias;
-        } else {
-            auto scale = alphaPtr[i];
-            auto bias = biasPtr[i];
-            weightKernelSum[i] = static_cast<float>(sum) * scale + LSize * bias;
-        }
-    }
-}
-
 CPUConvolution::CPUConvolution(const Convolution2DCommon *convOp, Backend *b) : MNN::Execution(b), mCommon(convOp) {
     // Do nothing
 }

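Both deleted helpers computed the same per-output-channel quantities for int8 weights: a dequant pair wf = scale * wi + bias (with bias = -zeroPoint * scale), and a float weightKernelSum = sum(wf) = scale * sum(wi) + L * bias over the L = ic * kh * kw weights of an output channel. The sketch below reproduces that arithmetic assuming plain, unpacked float buffers; the helper name and struct are hypothetical.

#include <cstdint>
#include <cstdio>
#include <vector>

// Per-output-channel dequantization parameters and kernel sums:
//   wf = scale * wi + bias,  kernelSum = scale * sum(wi) + L * bias
struct DequantInfo {
    std::vector<float> scale;      // alphaPtr in the removed code
    std::vector<float> bias;       // biasPtr: -zeroPoint * scale
    std::vector<float> kernelSum;  // weightKernelSum
};

static DequantInfo makeDequantInfo(const int8_t* weight, const float* wScale,
                                   const int32_t* wZero, int outputCount, int L) {
    DequantInfo info;
    info.scale.resize(outputCount);
    info.bias.resize(outputCount);
    info.kernelSum.resize(outputCount);
    for (int o = 0; o < outputCount; ++o) {
        info.scale[o] = wScale[o];
        info.bias[o]  = -1.f * wZero[o] * wScale[o];
        int sum = 0;
        for (int j = 0; j < L; ++j) {
            sum += weight[o * L + j];
        }
        info.kernelSum[o] = (float)sum * info.scale[o] + (float)L * info.bias[o];
    }
    return info;
}

int main() {
    // One output channel with L = 4 weights {1, -2, 3, 0}, scale 0.5, zero point 1:
    // sum(wi) = 2, bias = -0.5, kernelSum = 0.5 * 2 + 4 * (-0.5) = -1.
    const int8_t  w[4]     = {1, -2, 3, 0};
    const float   scale[1] = {0.5f};
    const int32_t zero[1]  = {1};
    auto info = makeDequantInfo(w, scale, zero, 1, 4);
    std::printf("scale=%.2f bias=%.2f kernelSum=%.2f\n",
                info.scale[0], info.bias[0], info.kernelSum[0]);
    return 0;
}
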
+ 1 - 6
source/backend/cpu/CPUConvolution.hpp

@@ -69,12 +69,8 @@ public:
         bool mRelu;
         int mActBits;  // quant bits
 
-        int mOutputCount;
         bool mUseConvQuan = true;
         bool mWeightAsymmetricQuant = true;
-#ifdef MNN_USE_SSE
-        std::vector<int> offsets;
-#endif
         // Origin Attributes from net
         float mInputScale = 0.0f;
         float mOutputScale = 0.0f;
@@ -82,6 +78,7 @@ public:
         int32_t mOutputZeroPoint;
         int8_t mClampMin;
         int8_t mClampMax;
+        bool mDynamicQuant = false;
     };
     struct MutableResourceInt8 {
         MutableResourceInt8(std::shared_ptr<ResourceInt8> res, Backend* backend);
@@ -100,8 +97,6 @@ public:
         bool mValid;
     };
     static std::shared_ptr<ResourceInt8> makeResourceInt8(Backend *backend, const MNN::Op *op, int pack=4);
-    static void makeResource(Backend* backend, std::shared_ptr<Resource> resource, const MNN::Op *op, std::shared_ptr<ResourceInt8> resourceInt8 = nullptr);
-    static void makeResourceNew(Backend* backend, const Convolution2D* conv2d, std::shared_ptr<ResourceInt8> resourceInt8);
     CPUConvolution(const Convolution2DCommon *convOp, Backend *b);
     virtual ~CPUConvolution() = default;
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

+ 4 - 2
source/backend/cpu/CPUDeconvolution.cpp

@@ -346,7 +346,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     }
 
     mPostFunctions.emplace_back(std::make_pair([ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
-                       strideX, threadNumber, src_width, src_height, plane, input, biasTensor, this, core, gcore, batch, outi8, scales,
+                       strideX, threadNumber, src_width, src_height, plane, input, biasTensor, this, core, gcore, batch, outi8, scale,
                        minValue, maxValue, zeroPoint, outputFp32Ptr](uint8_t* outputPtr, int tId) {
         auto colBufferPtr = mTempOutput->host<uint8_t>();
         auto biasPtr      = biasTensor->host<float>();
@@ -391,7 +391,9 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
             }
             core->MNNAxByClampBroadcastUnit((float*)dstZ, (float*)dstZ, (const float*)((uint8_t*)biasPtr +  unitBytes * z), src_height * src_width * batch, 0, 0, 1, mPostParameters.data());
             if (outi8) {
-                gcore->MNNFloat2Int8((float*)dstZ, (int8_t*)(outputPtr + z * float2Int8_step * core->pack), float2Int8_step, scales.data(), minValue, maxValue, zeroPoint);
+                float scaleOne = scale;
+                float zeroOne  = zeroPoint;
+                gcore->MNNFloat2Int8((float*)dstZ, (int8_t*)(outputPtr + z * float2Int8_step * core->pack), float2Int8_step, &scaleOne, minValue, maxValue, &zeroOne, 0);
             }
         }
     }, threadNumber));

+ 1 - 1
source/backend/cpu/CPUDynamicQuant.cpp

@@ -46,7 +46,7 @@ ErrorCode CPUDynamicQuant::onExecute(const std::vector<Tensor*> &inputs,
     int pack = core->pack;
     std::vector<float> qsVec(pack, quantScale);
     int sizeDiv = UP_DIV(size, pack);
-    int8core->MNNFloat2Int8(inputPtr, outputPtr, sizeDiv, qsVec.data(), -128, 127, (ssize_t)zeroPoint);
+    int8core->MNNFloat2Int8(inputPtr, outputPtr, sizeDiv, &quantScale, -128, 127, &zeroPoint, 0);
     float* scale = outputs[1]->host<float>();
     float* zeros = outputs[2]->host<float>();
     *scale = dequantScale;

+ 2 - 2
source/backend/cpu/CPUFloatToInt8.cpp

@@ -36,7 +36,7 @@ CPUFloatToInt8::CPUFloatToInt8(Backend* backend, const MNN::Op* param) : Executi
         memcpy(mScales->host<float>(), scale->tensorScale()->data(), scaleLen * sizeof(float));
     }
 
-    mZeroPoint = scale->zeroPoint();
+    mZeroPoint = static_cast<float>(scale->zeroPoint());
     mClampMin = scale->clampMin();
     mClampMax = scale->clampMax();
 }
@@ -78,7 +78,7 @@ ErrorCode CPUFloatToInt8::onExecute(const std::vector<Tensor*>& inputs, const st
         const auto srcChannelPtr   = inputDataPtr + tId * oc4Stride * pack;
         const auto scaleChannelPtr = scaleDataPtr + z * pack;
         auto dstChannlePtr         = outputDataPtr + tId * oc4Stride * pack;
-        int8F->MNNFloat2Int8(srcChannelPtr, dstChannlePtr, oc4Stride, scaleChannelPtr, mClampMin, mClampMax, mZeroPoint);
+        int8F->MNNFloat2Int8(srcChannelPtr, dstChannlePtr, oc4Stride, scaleChannelPtr, mClampMin, mClampMax, &mZeroPoint, 1);
     }
     MNN_CONCURRENCY_END();
     return NO_ERROR;

+ 1 - 1
source/backend/cpu/CPUFloatToInt8.hpp

@@ -22,7 +22,7 @@ public:
 
 private:
     std::shared_ptr<Tensor> mScales;
-    int8_t mZeroPoint;
+    float mZeroPoint;
     int8_t mClampMin;
     int8_t mClampMax;
     int mClipBits;

+ 0 - 1
source/backend/cpu/CPUImageProcess.cpp

@@ -15,7 +15,6 @@
 #include <utility>
 
 namespace MNN {
-#define CACHE_SIZE 256
 
 ErrorCode CPUImageProcess::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     auto input = inputs[0];

+ 14 - 9
source/backend/cpu/CPUProposal.cpp

@@ -16,12 +16,17 @@
 #include <MNN/AutoTime.hpp>
 namespace MNN {
 
-CPUProposal::CPUProposal(Backend *backend, const Proposal *proposal) : Execution(backend), mProposal(proposal) {
-    auto ratioCount = mProposal->ratios()->float32s()->size();
-    auto numScale   = mProposal->scales()->float32s()->size();
+CPUProposal::CPUProposal(Backend *backend, const Proposal *proposal) : Execution(backend) {
+    auto ratioCount = proposal->ratios()->float32s()->size();
+    auto numScale   = proposal->scales()->float32s()->size();
     mAnchors.reset(4 * ratioCount * numScale);
+    mCache.featStride   = proposal->featStride();
+    mCache.preNmsTopN   = proposal->preNmsTopN();
+    mCache.nmsThreshold = proposal->nmsThreshold();
+    mCache.afterNmsTopN = proposal->afterNmsTopN();
+    mCache.minSize      = proposal->minSize();
 
-    auto baseSize = mProposal->baseSize();
+    auto baseSize = proposal->baseSize();
     const auto cx = baseSize * 0.5f;
     const auto cy = baseSize * 0.5f;
     auto ratios   = proposal->ratios()->float32s()->data();
@@ -117,11 +122,11 @@ ErrorCode CPUProposal::onExecute(const std::vector<Tensor *> &inputs, const std:
     auto score  = inputs[0];
     auto boxes  = inputs[1];
     auto imInfo = inputs[2];
-    auto featStride   = mProposal->featStride();
-    auto preNmsTopN   = mProposal->preNmsTopN();
-    auto nmsThreshold = mProposal->nmsThreshold();
-    auto afterNmsTopN = mProposal->afterNmsTopN();
-    auto minSize      = mProposal->minSize();
+    auto featStride   = mCache.featStride;
+    auto preNmsTopN   = mCache.preNmsTopN;
+    auto nmsThreshold = mCache.nmsThreshold;
+    auto afterNmsTopN = mCache.afterNmsTopN;
+    auto minSize      = mCache.minSize;
 
     float* tmpScorePtr = (float*)mScoreBuffer.ptr();
     // download

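CPUProposal no longer keeps the const Proposal* pointing into the flatbuffer; the scalar attributes it needs are copied into a ProposalCache at construction, so onExecute does not rely on the serialized model staying resident. The sketch below shows that copy-out-of-flatbuffer pattern; ProposalParamView is a hypothetical stand-in for the generated Proposal accessor table.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the flatbuffer-generated accessors (featStride(), ...).
struct ProposalParamView {
    int   featStride()   const { return 16; }
    int   preNmsTopN()   const { return 6000; }
    int   afterNmsTopN() const { return 300; }
    int   minSize()      const { return 16; }
    float nmsThreshold() const { return 0.7f; }
};

// Plain-old-data cache copied once at construction time, mirroring ProposalCache.
struct ProposalCache {
    int32_t featStride;
    int32_t preNmsTopN;
    int32_t minSize;
    int32_t afterNmsTopN;
    float   nmsThreshold;
};

class ProposalOp {
public:
    explicit ProposalOp(const ProposalParamView* p) {
        // Copy the scalars; p may point into a buffer that is released afterwards.
        mCache.featStride   = p->featStride();
        mCache.preNmsTopN   = p->preNmsTopN();
        mCache.nmsThreshold = p->nmsThreshold();
        mCache.afterNmsTopN = p->afterNmsTopN();
        mCache.minSize      = p->minSize();
    }
    void run() const {
        std::printf("featStride=%d preNmsTopN=%d nms=%.2f\n",
                    mCache.featStride, mCache.preNmsTopN, mCache.nmsThreshold);
    }
private:
    ProposalCache mCache;
};

int main() {
    ProposalParamView params;
    ProposalOp op(&params);
    op.run(); // still valid even if the original parameter buffer were gone by now
    return 0;
}
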
+ 8 - 1
source/backend/cpu/CPUProposal.hpp

@@ -24,8 +24,15 @@ public:
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
 
+    struct ProposalCache {
+        int32_t featStride;
+        int32_t preNmsTopN;
+        int32_t minSize;
+        int32_t afterNmsTopN;
+        float nmsThreshold;
+    };
 private:
-    const Proposal *mProposal;
+    ProposalCache mCache;
     AutoStorage<float> mAnchors;
     MemChunk mScoreBuffer;
 };

+ 223 - 69
source/backend/cpu/KVCacheManager.cpp

@@ -13,7 +13,7 @@
 
 namespace MNN {
 
-// @brief  Translate an address to a hex number string
+// Translate an address to a hex number string
 static inline std::string addrToHex(void *addr) {
     std::string result = "";
     uint64_t n = (uint64_t)addr;
@@ -106,11 +106,27 @@ void KVCacheManager::unmapKVCache(size_t keySize, size_t valueSize)
 */
 void KVCacheManager::expandKVCacheInMem(int oldMaxLength) {
     /*===================================  Key  ===================================*/
-    if (mConfig.mQuantKey) {
+    if (mConfig.mUseInt8Kernel) {
+        auto new_key = Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(mMaxLength, hP8), UP_DIV(mHeadDim, lP8), hP8 * lP8});
+        mBackend->onAcquireBuffer(new_key, Backend::STATIC);
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(
+                new_key->host<char>() + h * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8,
+                mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8,
+                UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8
+            );
+        }
+        mPastKey.reset(new_key);
+    }
+    else if (mConfig.mQuantKey) {
         auto new_key = Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP});
         mBackend->onAcquireBuffer(new_key, Backend::STATIC);
         for (int h = 0; h < mKvNumHead; h++) {
-            memcpy(new_key->host<char>() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP,  mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP);
+            memcpy(
+                new_key->host<char>() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP,
+                mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP,
+                UP_DIV(oldMaxLength, hP) * mHeadDim * hP
+            );
         }
         mPastKey.reset(new_key);
     }
@@ -118,7 +134,11 @@ void KVCacheManager::expandKVCacheInMem(int oldMaxLength) {
         auto new_key = Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP});
         mBackend->onAcquireBuffer(new_key, Backend::STATIC);
         for (int h = 0; h < mKvNumHead; h++) {
-            memcpy(new_key->host<char>() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes);
+            memcpy(
+                new_key->host<char>() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes,
+                mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes,
+                UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes
+            );
         }
         mPastKey.reset(new_key);
     }
@@ -128,7 +148,11 @@ void KVCacheManager::expandKVCacheInMem(int oldMaxLength) {
         mBackend->onAcquireBuffer(new_value, Backend::STATIC);
         for (int h = 0; h < mKvNumHead; h++) {
             for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
-                memcpy(new_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP);
+                memcpy(
+                    new_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP,
+                    mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP,
+                    oldMaxLength * hP
+                );
             }
         }
         mPastValue.reset(new_value);
@@ -138,7 +162,11 @@ void KVCacheManager::expandKVCacheInMem(int oldMaxLength) {
         mBackend->onAcquireBuffer(new_value, Backend::STATIC);
         for (int h = 0; h < mKvNumHead; h++) {
             for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
-                memcpy(new_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes);
+                memcpy(
+                    new_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes,
+                    mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes,
+                    oldMaxLength * hP * mBytes
+                );
             }
         }
         mPastValue.reset(new_value);
@@ -151,16 +179,35 @@ void KVCacheManager::expandKVCacheInMem(int oldMaxLength) {
 */
 void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) {
     /*===================================  Key  ===================================*/
+    if (mConfig.mUseInt8Kernel) {
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(
+                mMapKeyAddr + h * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8,
+                mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8,
+                UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8
+            );
+        }
+        mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC);
+        mPastKey.reset();
+    }
-    if (mConfig.mQuantKey) {
+    else if (mConfig.mQuantKey) {
         for (int h = 0; h < mKvNumHead; h++) {
-            memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP);
+            memcpy(
+                mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP,
+                mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP,
+                UP_DIV(oldMaxLength, hP) * mHeadDim * hP
+            );
         }
         mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC);
         mPastKey.reset();
     }
     else {
         for (int h = 0; h < mKvNumHead; h++) {
-            memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes);
+            memcpy(
+                mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes,
+                mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes,
+                UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes
+            );
         }
         mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC);
         mPastKey.reset();
@@ -169,7 +216,11 @@ void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) {
     if (mConfig.mQuantValue) {
         for (int h = 0; h < mKvNumHead; h++) {
             for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
-                memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP);
+                memcpy(
+                    mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP,
+                    mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP,
+                    oldMaxLength * hP
+                );
             }
         }
         mBackend->onReleaseBuffer(mPastValue.get(), Backend::STATIC);
@@ -178,7 +229,11 @@ void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) {
     else {
         for (int h = 0; h < mKvNumHead; h++) {
             for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
-                memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes);
+                memcpy(
+                    mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes,
+                    mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes,
+                    oldMaxLength * hP * mBytes
+                );
             }
         }
         mBackend->onReleaseBuffer(mPastValue.get(), Backend::STATIC);
@@ -189,14 +244,12 @@ void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) {
 /*
 **  @brief  Expand the size of kvcache files in disk
 */
-void KVCacheManager::expandKVCacheInDisk(int oldMaxLength) {
-    size_t oldKeySize   = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
-    size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
-    size_t keySize      = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
-    size_t valueSize    = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+void KVCacheManager::expandKVCacheInDisk(int oldMaxLength, int oldKeySize, int oldValueSize, int keySize, int valueSize) {
     // Step 1: Copy the old kvcache from files to temporary buffers in memory
     std::shared_ptr<Tensor> old_key, old_value;
-    if (mConfig.mQuantKey) {
+    if (mConfig.mUseInt8Kernel) {
+        old_key.reset(Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(oldMaxLength, hP8), UP_DIV(mHeadDim, lP8), hP8 * lP8}));
+    } else if (mConfig.mQuantKey) {
         old_key.reset(Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(oldMaxLength, hP), mHeadDim, hP}));
     } else {
         old_key.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(oldMaxLength, hP), mHeadDim, hP}));  
@@ -216,25 +269,49 @@ void KVCacheManager::expandKVCacheInDisk(int oldMaxLength) {
     resetKVCacheFileSize(keySize, valueSize);
     mmapKVCache(keySize, valueSize);
     // Step 3: Move the kvcache from temporary buffers in memory to disk
-    if (mConfig.mQuantKey) {
+    if (mConfig.mUseInt8Kernel) {
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(
+                mMapKeyAddr + h * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8,
+                old_key->host<char>() + h * UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8,
+                UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8
+            );
+        }
+    } else if (mConfig.mQuantKey) {
         for (int h = 0; h < mKvNumHead; h++) {
-            memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP,  old_key->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP);
+            memcpy(
+                mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP,
+                old_key->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP,
+                UP_DIV(oldMaxLength, hP) * mHeadDim * hP
+            );
         }
     } else {
         for (int h = 0; h < mKvNumHead; h++) {
-            memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, old_key->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes);
+            memcpy(
+                mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes,
+                old_key->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes,
+                UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes
+            );
         }
     }
     if (mConfig.mQuantValue) {
         for (int h = 0; h < mKvNumHead; h++) {
             for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
-                memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, old_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP);
+                memcpy(
+                    mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP,
+                    old_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP,
+                    oldMaxLength * hP
+                );
             }
         }
     } else {
         for (int h = 0; h < mKvNumHead; h++) {
             for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
-                memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, old_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes);
+                memcpy(
+                    mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes,
+                    old_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes,
+                    oldMaxLength * hP * mBytes
+                );
             }
         }
     }
@@ -253,12 +330,22 @@ void KVCacheManager::onResize(int kv_num_head, int head_dim) {
     if (mThreadNum > mKvNumHead) {
         mThreadNum = mKvNumHead;
     }
+    if (mConfig.mUseInt8Kernel) {
+        static_cast<CPUBackend *>(mBackend)->int8Functions()->MNNGetGemmUnit(&hP8, &lP8, &eP8);
+    }
 }
 
 void KVCacheManager::onAlloc(int kv_seq_len) {
     mMaxLength = kv_seq_len + mConfig.mExpandChunk;
-    size_t keySize   = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
-    size_t valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    size_t keySize = 0, valueSize = 0;
+    if (mConfig.mUseInt8Kernel) {
+        keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8;
+    } else if (mConfig.mQuantKey) {
+        keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP;
+    } else {
+        keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes;
+    }
+    valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
     /*============== Put the kvcache in disk ===========*/
     if (mConfig.mKVCacheSizeLimit != -1 && keySize + valueSize > mConfig.mKVCacheSizeLimit) {
         createKVCacheFile();
@@ -268,7 +355,9 @@ void KVCacheManager::onAlloc(int kv_seq_len) {
     }
     /*============== Put the kvcache in memory ===========*/
     else {
-        if (mConfig.mQuantKey) {
+        if (mConfig.mUseInt8Kernel) {
+            mPastKey.reset(Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(mMaxLength, hP8), UP_DIV(mHeadDim, lP8), hP8 * lP8}));
+        } else if (mConfig.mQuantKey) {
             mPastKey.reset(Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}));
         } else {
             mPastKey.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}));
@@ -278,15 +367,22 @@ void KVCacheManager::onAlloc(int kv_seq_len) {
         } else {
             mPastValue.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP}));
         }
-        mBackend->onAcquireBuffer(mPastKey.get(), Backend::STATIC);
-        mBackend->onAcquireBuffer(mPastValue.get(), Backend::STATIC);    
-    }
-    /* No matter where is the kvcache, the scales and zero points are always in memory, since their size is very small */
-    if (mConfig.mQuantKey) {
-        mDequantKeyScale.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP}));
-        mDequantKeyZeroPoint.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP}));
-        mBackend->onAcquireBuffer(mDequantKeyScale.get(), Backend::STATIC);
-        mBackend->onAcquireBuffer(mDequantKeyZeroPoint.get(), Backend::STATIC);
+        mBackend->onAcquireBuffer(mPastKey.get(), Backend::STATIC); 
+        mBackend->onAcquireBuffer(mPastValue.get(), Backend::STATIC); 
+    }
+    // Scale, zero point and key sums used for quantization
+    if (mConfig.mUseInt8Kernel) {
+        mKeyScale.reset(Tensor::createDevice<int32_t>({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8}));
+        mKeyZeroPoint.reset(Tensor::createDevice<int32_t>({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8}));
+        mKeySum.reset(Tensor::createDevice<int32_t>({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8}));
+        mBackend->onAcquireBuffer(mKeyScale.get(), Backend::STATIC);
+        mBackend->onAcquireBuffer(mKeyZeroPoint.get(), Backend::STATIC);
+        mBackend->onAcquireBuffer(mKeySum.get(), Backend::STATIC);
+    } else if (mConfig.mQuantKey) {
+        mKeyScale.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), hP}));
+        mKeyZeroPoint.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), hP}));
+        mBackend->onAcquireBuffer(mKeyScale.get(), Backend::STATIC);
+        mBackend->onAcquireBuffer(mKeyZeroPoint.get(), Backend::STATIC);
     }
 }
 
@@ -296,10 +392,19 @@ void KVCacheManager::onRealloc(int kv_seq_len) {
     }
     int oldMaxLength = mMaxLength;
     mMaxLength = kv_seq_len + mConfig.mExpandChunk;
-    size_t oldKeySize   = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
-    size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
-    size_t keySize      = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
-    size_t valueSize    = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    size_t oldKeySize, oldValueSize, keySize, valueSize;
+    if (mConfig.mUseInt8Kernel) {
+        oldKeySize = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8;
+        keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8;
+    } else if (mConfig.mQuantKey) {
+        oldKeySize = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP;
+        keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP;
+    } else {
+        oldKeySize = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes;
+        keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes;
+    }
+    oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
     /*==== No limit for kvcache ====*/
     if (mConfig.mKVCacheSizeLimit == -1) {
         expandKVCacheInMem(oldMaxLength);
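The key-cache byte sizes computed in onAlloc/onRealloc/onClear depend on the layout: with mUseInt8Kernel the per-head key block is [maxlen/hP8, headdim/lP8, hP8, lP8] at one byte per element, otherwise [maxlen/hP, headdim, hP] at one byte (quantized) or mBytes (fp16/fp32). The worked computation below plugs in illustrative numbers; hP8, lP8, hP and the byte width are assumptions here, since the real values come from MNNGetGemmUnit and the core functions.

#include <cstddef>
#include <cstdio>

// UP_DIV as used in the MNN sources: ceiling division.
static inline size_t upDiv(size_t a, size_t b) { return (a + b - 1) / b; }

int main() {
    // Illustrative shapes and tile sizes only.
    const size_t kvNumHead = 8, maxLength = 1024, headDim = 128;
    const size_t hP8 = 4, lP8 = 4;          // assumed int8 GEMM tile
    const size_t hP  = 8, bytes = 2;        // assumed fp16 pack and width

    // Int8-kernel key layout: numhead x [maxlen/hP8, headdim/lP8, hP8, lP8], 1 byte each.
    size_t keySizeInt8 = kvNumHead * upDiv(maxLength, hP8) * upDiv(headDim, lP8) * hP8 * lP8;

    // Non-quantized key layout: numhead x [maxlen/hP, headdim, hP] x mBytes.
    size_t keySizeFp16 = kvNumHead * upDiv(maxLength, hP) * headDim * hP * bytes;

    std::printf("int8 key cache: %zu bytes\n", keySizeInt8); // 8 * 256 * 32 * 16 = 1048576
    std::printf("fp16 key cache: %zu bytes\n", keySizeFp16); // 8 * 128 * 128 * 8 * 2 = 2097152
    return 0;
}
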
@@ -318,51 +423,100 @@ void KVCacheManager::onRealloc(int kv_seq_len) {
     }
     /*==== Last time the kvcache is disk, now it should be in disk too ====*/
     else {
-        expandKVCacheInDisk(oldMaxLength);
+        expandKVCacheInDisk(oldMaxLength, oldKeySize, oldValueSize, keySize, valueSize);
     }
     /* No matter where is the kvcache, the scales and zero points are always in memory, since their size is very small */
-    if (mConfig.mQuantKey) {
+    if (mConfig.mUseInt8Kernel) {
+        auto new_scale = Tensor::createDevice<int32_t>({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8});
+        auto new_zeroPoint = Tensor::createDevice<int32_t>({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8});
+        auto new_sum = Tensor::createDevice<int32_t>({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8});
+        mBackend->onAcquireBuffer(new_scale, Backend::STATIC);
+        mBackend->onAcquireBuffer(new_zeroPoint, Backend::STATIC);
+        mBackend->onAcquireBuffer(new_sum, Backend::STATIC);
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(new_scale->host<char>() + h * UP_DIV(mMaxLength, hP8) * hP8 * 4, mKeyScale->host<char>() + h * UP_DIV(oldMaxLength, hP8) * hP8 * 4, UP_DIV(oldMaxLength, hP8) * hP8 * 4);
+            memcpy(new_zeroPoint->host<char>() + h * UP_DIV(mMaxLength, hP8) * hP8 * 4, mKeyZeroPoint->host<char>() + h * UP_DIV(oldMaxLength, hP8) * hP8 * 4, UP_DIV(oldMaxLength, hP8) * hP8 * 4);
+            memcpy(new_sum->host<char>() + h * UP_DIV(mMaxLength, hP8) * hP8 * 4, mKeySum->host<char>() + h * UP_DIV(oldMaxLength, hP8) * hP8 * 4, UP_DIV(oldMaxLength, hP8) * hP8 * 4);
+        }
+        mKeyScale.reset(new_scale);
+        mKeyZeroPoint.reset(new_zeroPoint);
+        mKeySum.reset(new_sum);
+    } else if (mConfig.mQuantKey) {
         auto new_scale = Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP});
         auto new_zeroPoint = Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP});
         mBackend->onAcquireBuffer(new_scale, Backend::STATIC);
         mBackend->onAcquireBuffer(new_zeroPoint, Backend::STATIC);
         for (int h = 0; h < mKvNumHead; h++) {
-            memcpy(new_scale->host<char>() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mDequantKeyScale->host<char>() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes);
-            memcpy(new_zeroPoint->host<char>() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mDequantKeyZeroPoint->host<char>() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes);
+            memcpy(new_scale->host<char>() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mKeyScale->host<char>() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes);
+            memcpy(new_zeroPoint->host<char>() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mKeyZeroPoint->host<char>() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes);
         }
-        mDequantKeyScale.reset(new_scale);
-        mDequantKeyZeroPoint.reset(new_zeroPoint);
+        mKeyScale.reset(new_scale);
+        mKeyZeroPoint.reset(new_zeroPoint);
     }
 }
 
 void KVCacheManager::onClear() {
     if (mKVCacheInDisk) {
-        size_t oldKeySize   = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
-        size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
-        unmapKVCache(oldKeySize, oldValueSize);
+        size_t keySize = 0, valueSize = 0;
+        if (mConfig.mUseInt8Kernel) {
+            keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8;
+        } else if (mConfig.mQuantKey) {
+            keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP;
+        } else {
+            keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes;
+        }
+        valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);    
+        unmapKVCache(keySize, valueSize);
         removeKVCacheFile();
         mKVCacheInDisk = false;
     }
-    else {
-        mPastKey.reset();
-        mPastValue.reset();
-    }
+    mPastKey.reset();
+    mPastValue.reset();
+    mKeyScale.reset();
+    mKeyZeroPoint.reset();
+    mKeySum.reset();
     mMaxLength = mPastLength = 0;
 }
 
 template <typename T>
-static void pack_key(const Tensor* key, char* pack_key, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim,
-                    int hP, int kv_h, bool quantKey, char* scale, char* zero_point, const MNN::CoreFunctions * core) {
-    if (quantKey) {
-        int8_t * key_dst = reinterpret_cast<int8_t*>(pack_key);
-        T * scale_dst = reinterpret_cast<T*>(scale);
-        T * zeroPoint_dst = reinterpret_cast<T*>(zero_point);
+void KVCacheManager::pack_key(const Tensor* key, int seq_len, int kv_h) {
+    if (mConfig.mUseInt8Kernel) {  // [maxlen/hP8, headdim/lP8, hP8, lP8]
+        int8_t * key_dst = reinterpret_cast<int8_t*>(addrOfKey(kv_h));
+        float * scale_dst = reinterpret_cast<float*>(addrOfScale(kv_h));
+        float * zeroPoint_dst = reinterpret_cast<float*>(addrOfZeroPoint(kv_h));
+        float * sum_dst = reinterpret_cast<float*>(addrOfKeySum(kv_h));
+        for (int s = 0; s < seq_len; s++) {
+            T * key_src = key->host<T>() + s * mKvNumHead * mHeadDim + kv_h * mHeadDim;
+            float minKey = key_src[0];
+            float maxKey = key_src[0];
+            float sumKey = key_src[0];
+            for (int d = 1; d < mHeadDim; d++) {
+                minKey = ALIMIN(minKey, key_src[d]);
+                maxKey = ALIMAX(maxKey, key_src[d]);
+                sumKey += key_src[d];
+            }
+            int out_index = (mPastLength + s) / hP8;
+            int in_index  = (mPastLength + s) % hP8;
+            scale_dst[out_index * hP8 + in_index] = (maxKey - minKey) / 255.0f;
+            zeroPoint_dst[out_index * hP8 + in_index] = -255.0f * minKey / (maxKey - minKey) - 128.0;
+            sum_dst[out_index * hP8 + in_index] = sumKey;
+            for (int d = 0; d < mHeadDim; d++) {
+                int i = d / lP8;
+                int j = d % lP8;
+                key_dst[out_index * UP_DIV(mHeadDim, lP8) * hP8 * lP8 + i * hP8 * lP8 + in_index * lP8 + j] = roundf((key_src[d] - minKey) / (maxKey - minKey) * 255.0f - 128.0f);
+            }
+        }
+    }
+    else if (mConfig.mQuantKey) {  // [maxlen/hP, headdim, hP]
+        int8_t * key_dst = reinterpret_cast<int8_t*>(addrOfKey(kv_h));
+        T * scale_dst = reinterpret_cast<T*>(addrOfScale(kv_h));
+        T * zeroPoint_dst = reinterpret_cast<T*>(addrOfZeroPoint(kv_h));
         for (int i = 0; i < seq_len; i++) {
             T * key_src = key->host<T>() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim;
             int out_index = (mPastLength + i) / hP;
             int in_index  = (mPastLength + i) % hP;
             T minKey, maxKey;
-            core->MNNCountMaxMinValue((float*)key_src, (float*)&minKey, (float*)&maxKey, mHeadDim);
+            static_cast<CPUBackend*>(mBackend)->functions()->MNNCountMaxMinValue((float*)key_src, (float*)&minKey, (float*)&maxKey, mHeadDim);
             scale_dst[out_index * hP + in_index] = (maxKey - minKey) / 255.0f;
             zeroPoint_dst[out_index * hP + in_index] = 128.0f * (maxKey - minKey) / 255.0f + minKey;
             for (int j = 0; j < mHeadDim; j++) {
@@ -370,8 +524,8 @@ static void pack_key(const Tensor* key, char* pack_key, int mPastLength, int seq
             }
         }
     }
-    else {
-        T * key_dst = reinterpret_cast<T*>(pack_key);
+    else { // [maxlen/hP, headdim, hP]
+        T * key_dst = reinterpret_cast<T*>(addrOfKey(kv_h));
         for (int i = 0; i < seq_len; i++) {
             T * key_src = key->host<T>() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim;
             int out_index = (mPastLength + i) / hP;
@@ -384,16 +538,16 @@ static void pack_key(const Tensor* key, char* pack_key, int mPastLength, int seq
 }
 
 template <typename T>
-static void pack_value(const Tensor* value, char* pack_value, int mMaxLength, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, int hP, int kv_h, bool quantValue, const MNN::CoreFunctions * core) {
-    if (quantValue) {
-        fp8_t * value_dst = reinterpret_cast<fp8_t*>(pack_value);
+void KVCacheManager::pack_value(const Tensor* value, int seq_len, int kv_h) { // [headdim/hP, maxlen, hP]
+    if (mConfig.mQuantValue) {
+        fp8_t * value_dst = reinterpret_cast<fp8_t*>(addrOfValue(kv_h));
         uint8_t * buf = (uint8_t *)MNNMemoryAllocAlign(mHeadDim, MNN_MEMORY_ALIGN_DEFAULT);
         for (int i = 0; i < seq_len; i++) {
             T * value_src = value->host<T>() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim;
             if (sizeof(T) == 2) {
-                core->MNNFp16ToFp8(buf, (uint16_t*)value_src, mHeadDim);
+                static_cast<CPUBackend*>(mBackend)->functions()->MNNFp16ToFp8(buf, (uint16_t*)value_src, mHeadDim);
             } else {
-                core->MNNFp32ToFp8(buf, (float*)value_src, mHeadDim);
+                static_cast<CPUBackend*>(mBackend)->functions()->MNNFp32ToFp8(buf, (float*)value_src, mHeadDim);
             }
             for (int j = 0; j < mHeadDim; j++) {
                 int out_index = j / hP;
@@ -404,7 +558,7 @@ static void pack_value(const Tensor* value, char* pack_value, int mMaxLength, in
         MNNMemoryFreeAlign(buf);
     }
     else {
-        T * value_dst = reinterpret_cast<T*>(pack_value);
+        T * value_dst = reinterpret_cast<T*>(addrOfValue(kv_h));
         for (int i = 0; i < seq_len; i++) {
             T * value_src = value->host<T>() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim;
             for (int j = 0; j < mHeadDim; j++) {
@@ -423,11 +577,11 @@ void KVCacheManager::onPushBack(const Tensor * key, const Tensor * value) {
     std::function<void(int)> packKV = [=](int tid) {
         for (int kv_h = tid * tileCount; kv_h < (tid+1) * tileCount && kv_h < mKvNumHead; kv_h++) {
             if (mBytes == 2) {
-                pack_key<FLOAT16_T>(key, addrOfKey(kv_h), mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantKey, addrOfScale(kv_h), addrOfZeroPoint(kv_h), core);
-                pack_value<FLOAT16_T>(value, addrOfValue(kv_h), mMaxLength, mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantValue, core);
+                pack_key<FLOAT16_T>(key, seq_len, kv_h);
+                pack_value<FLOAT16_T>(value, seq_len, kv_h);
             } else {
-                pack_key<float>(key, addrOfKey(kv_h), mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantKey, addrOfScale(kv_h), addrOfZeroPoint(kv_h), core);
-                pack_value<float>(value, addrOfValue(kv_h), mMaxLength, mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantValue, core);
+                pack_key<float>(key, seq_len, kv_h);
+                pack_value<float>(value, seq_len, kv_h);
             }
         }
     };
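
The per-row key quantization above is standard asymmetric 8-bit quantization: scale = (max - min) / 255 and, on the float-GEMM path, zeroPoint = 128 * scale + min, so that key ~= scale * q + zeroPoint; the int8-GEMM path at the top of the hunk instead stores an integer-style zero point (-255 * min / (max - min) - 128) plus per-row sums for later requantization. A minimal scalar sketch of the float-GEMM variant, with quantize_key_row as a hypothetical helper rather than MNN API:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct KeyQuant {
    float scale;      // (max - min) / 255
    float zeroPoint;  // key ~= scale * q + zeroPoint after dequantization
};

// Quantize one key row of headDim floats to int8, as pack_key does per token.
// Assumes maxKey > minKey, as the kernel above does.
static KeyQuant quantize_key_row(const float* src, int8_t* dst, int headDim) {
    float minKey = src[0], maxKey = src[0];
    for (int d = 1; d < headDim; ++d) {
        minKey = std::min(minKey, src[d]);
        maxKey = std::max(maxKey, src[d]);
    }
    KeyQuant q;
    q.scale = (maxKey - minKey) / 255.0f;
    q.zeroPoint = 128.0f * q.scale + minKey;
    for (int d = 0; d < headDim; ++d) {
        dst[d] = (int8_t)roundf((src[d] - minKey) / (maxKey - minKey) * 255.0f - 128.0f);
    }
    return q;
}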

+ 51 - 19
source/backend/cpu/KVCacheManager.hpp

@@ -29,8 +29,9 @@ namespace MNN {
 class KVCacheManager : public NonCopyable{
 public:
     struct KVCacheConfig {
-        bool mQuantKey   = false;               // Quantize keys to int8 or not
-        bool mQuantValue = false;               // Quantize values to fp8 or not
+        bool mQuantKey      = false;            // Quantize keys to int8 or not
+        bool mQuantValue    = false;            // Quantize values to fp8 or not
+        bool mUseInt8Kernel = false;            // Whether to use int8 gemm kernel in CPU attention
         std::string mKVCacheDir = "/tmp";       // Path of the kvcache files in disk
         size_t mKVCacheSizeLimit = -1;          // The limit of the kvcache size
         int  mExpandChunk = 64;                 // Number of expand chunks when the buffer is full
@@ -38,10 +39,11 @@ public:
 private:
     Backend * mBackend;
     KVCacheConfig mConfig;
-    std::shared_ptr<Tensor> mPastKey;               // numhead, [maxlen/eP, headdim, eP]
-    std::shared_ptr<Tensor> mPastValue;             // numhead, [headdim/eP, maxlen, eP]
-    std::shared_ptr<Tensor> mDequantKeyScale;       // numhead, [maxlen/eP, 1, eP]
-    std::shared_ptr<Tensor> mDequantKeyZeroPoint;   // numhead, [maxlen/eP, 1, eP]
+    std::shared_ptr<Tensor> mPastKey;               // {numhead, [maxlen/hP, headdim, hP]} or {numhead, [maxlen/hP8, headdim/lP8, hP8, lP8]} 
+    std::shared_ptr<Tensor> mPastValue;             // numhead, [headdim/hP, maxlen, hP]
+    std::shared_ptr<Tensor> mKeyScale;              // {numhead, [maxlen/hP, hP]} or {numhead, [maxlen/hP8, hP8]}
+    std::shared_ptr<Tensor> mKeyZeroPoint;          // {numhead, [maxlen/hP, hP]} or {numhead, [maxlen/hP8, hP8]}
+    std::shared_ptr<Tensor> mKeySum;                // numhead, [maxlen/hP8, hP8]
     file_t mKeyCacheFD   = INVALID_FILE;            // The file descriptor of keys
     file_t mValueCacheFD = INVALID_FILE;            // The file descriptor of values
     char * mMapKeyAddr   = nullptr;                 // Memory-mapped address of keys
@@ -49,8 +51,10 @@ private:
     bool mKVCacheInDisk  = false;                   // Whether the kvcache is in disk or in memory now
     int  mPastLength     = 0;                       // Length of past kvcache
     int  mMaxLength      = 0;                       // Capacity of current kvcache buffer (how many kv items can be stored at most)
-    int  eP, lP, hP, mBytes, mThreadNum;
-    int  mKvNumHead = 0, mHeadDim   = 0;
+    int  eP, lP, hP;                                // Packing mode for float matmul
+    int  eP8, lP8, hP8;                             // Packing mode for int8 gemm kernel
+    int  mBytes = 4, mThreadNum = 1;
+    int  mKvNumHead = 0, mHeadDim = 0;
     void createKVCacheFile();
     void removeKVCacheFile();
     void resetKVCacheFileSize(size_t keySize, size_t valueSize);
@@ -58,7 +62,9 @@ private:
     void unmapKVCache(size_t keySize, size_t valueSize);
     void expandKVCacheInMem(int oldMaxLength);
     void moveKVCacheFromMemToDisk(int oldMaxLength);
-    void expandKVCacheInDisk(int oldMaxLength);
+    void expandKVCacheInDisk(int oldMaxLength, int oldKeySize, int oldValueSize, int keySize, int valueSize);
+    template <typename T> void pack_key(const Tensor* key, int seq_len, int kv_h);
+    template <typename T> void pack_value(const Tensor* value, int seq_len, int kv_h);
 public:
     KVCacheManager(Backend * backend, KVCacheConfig & kvConfig) {
         mBackend   = backend;
@@ -80,10 +86,13 @@ public:
         return mPastValue.get();
     }
     const Tensor * scale() {
-        return mDequantKeyScale.get();
+        return mKeyScale.get();
     }
     const Tensor * zeroPoint() {
-        return mDequantKeyZeroPoint.get();
+        return mKeyZeroPoint.get();
+    }
+    const Tensor * keySum() {
+        return mKeySum.get();
     }
     bool inDisk() {
         return mKVCacheInDisk;
@@ -96,23 +105,46 @@ public:
     }
     char * addrOfKey(int kv_h) {
         char * baseAddr = mKVCacheInDisk ? mMapKeyAddr : mPastKey->host<char>();
-        return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
+        if (mConfig.mUseInt8Kernel) {
+            return baseAddr + kv_h * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8;
+        } else if (mConfig.mQuantKey) {
+            return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * mHeadDim * hP;
+        } else {
+            return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes;
+        }
     }
     char * addrOfValue(int kv_h) {
         char * baseAddr = mKVCacheInDisk ? mMapValueAddr : mPastValue->host<char>();
-        return baseAddr + kv_h * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+        if (mConfig.mQuantValue) {
+            return baseAddr + kv_h * UP_DIV(mHeadDim, hP) * mMaxLength * hP;
+        } else {
+            return baseAddr + kv_h * UP_DIV(mHeadDim, hP) * mMaxLength * hP * mBytes;
+        }
     }
     char * addrOfScale(int kv_h) {
-        if (mConfig.mQuantKey == false)
+        if (mConfig.mUseInt8Kernel) {
+            return mKeyScale->host<char>() + kv_h * UP_DIV(mMaxLength, hP8) * hP8 * 4;
+        } else if (mConfig.mQuantKey) {
+            return mKeyScale->host<char>() + kv_h * UP_DIV(mMaxLength, hP) * hP * mBytes;
+        } else {
             return nullptr;
-        char * baseAddr = mDequantKeyScale->host<char>();
-        return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * 1 * hP * mBytes;
+        }
     }
     char * addrOfZeroPoint(int kv_h) {
-        if (mConfig.mQuantKey == false)
+        if (mConfig.mUseInt8Kernel) {
+            return mKeyZeroPoint->host<char>() + kv_h * UP_DIV(mMaxLength, hP8) * hP8 * 4;
+        } else if (mConfig.mQuantKey) {
+            return mKeyZeroPoint->host<char>() + kv_h * UP_DIV(mMaxLength, hP) * hP * mBytes;
+        } else {
+            return nullptr;
+        }
+    }
+    char * addrOfKeySum(int kv_h) {
+        if (mConfig.mUseInt8Kernel) {
+            return mKeySum->host<char>() + kv_h * UP_DIV(mMaxLength, hP8) * hP8 * 4;
+        } else {
             return nullptr;
-        char * baseAddr = mDequantKeyZeroPoint->host<char>();
-        return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * 1 * hP * mBytes;
+        }
     }
     void onResize(int kv_num_head, int head_dim);
     void onAlloc(int kv_seq_len);
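
The address helpers above stride one flat buffer by kv head, with the layout chosen by the quantization mode. A sketch of the key offset arithmetic, assuming UP_DIV(a, b) = (a + b - 1) / b as used throughout MNN (keyOffsetBytes is illustrative, not part of the class):

#include <cstddef>

static inline size_t upDiv(size_t a, size_t b) { return (a + b - 1) / b; }

// Byte offset of head kv_h inside the packed key cache, mirroring addrOfKey().
static size_t keyOffsetBytes(int kv_h, int maxLen, int headDim, int bytes,
                             int hP, int hP8, int lP8,
                             bool useInt8Kernel, bool quantKey) {
    if (useInt8Kernel) {   // [maxlen/hP8, headdim/lP8, hP8, lP8], 1 byte per element
        return (size_t)kv_h * upDiv(maxLen, hP8) * upDiv(headDim, lP8) * hP8 * lP8;
    } else if (quantKey) { // [maxlen/hP, headdim, hP], 1 byte per element
        return (size_t)kv_h * upDiv(maxLen, hP) * headDim * hP;
    } else {               // [maxlen/hP, headdim, hP], fp16/fp32 elements
        return (size_t)kv_h * upDiv(maxLen, hP) * headDim * hP * bytes;
    }
}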

+ 4 - 0
source/backend/cpu/arm/CMakeLists.txt

@@ -15,6 +15,10 @@ if (MNN_LOW_MEMORY)
     FILE(GLOB MNN_AArch64_SRC ${MNN_AArch64_SRC} ${CMAKE_CURRENT_LIST_DIR}/arm64/low_memory/*.[sS])
 endif()
 
+if (MNN_CPU_WEIGHT_DEQUANT_GEMM)
+    FILE(GLOB MNN_AArch64_SRC ${MNN_AArch64_SRC} ${CMAKE_CURRENT_LIST_DIR}/arm64/normal_memory/*.[sS])
+endif()
+
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?")
     message(STATUS "Enabling AArch32 Assemblies")
     add_library(MNNARM32 OBJECT ${MNN_AArch32_SRC} ${MNN_NEON_SRC})

+ 33 - 0
source/backend/cpu/arm/arm32/MNNBGRAToBGRC8.S

@@ -0,0 +1,33 @@
+//
+//  MNNBGRAToBGRC8.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNBGRAToBGRC8
+// void MNNBGRAToBGRC8(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+L1:
+vld4.8 {d0, d1, d2, d3}, [r0]!
+vst3.8 {d0, d1, d2}, [r1]!
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 43 - 0
source/backend/cpu/arm/arm32/MNNBGRAToGRAYFast.S

@@ -0,0 +1,43 @@
+//
+//  MNNBGRAToGRAYFast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNBGRAToGRAYFast
+// void MNNBGRAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+vmov.i8 d4, #7
+vmov.i8 d5, #38
+vmov.i8 d6, #19
+
+L1:
+vld4.8 {d0, d1, d2, d3}, [r0]!
+vmull.u8 q4, d0, d4 // b*7
+vmlal.u8 q4, d1, d5 // g*38
+vmlal.u8 q4, d2, d6 // r*19
+
+vqshrn.u16 d8, q4, #6
+vst1.u8 {d8}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif
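
The 7/38/19 weights used above are a 6-bit fixed-point approximation of the BT.601 luma weights (0.114 B + 0.587 G + 0.299 R, since 7 + 38 + 19 = 64). A scalar reference of what each NEON iteration computes, assuming count is given in 8-pixel blocks as the d-register loads suggest (bgraToGrayRef is illustrative only, not an MNN symbol):

#include <cstddef>
#include <cstdint>

// gray = (7*b + 38*g + 19*r) >> 6; the sum never exceeds 64*255, so no clamp is needed.
static void bgraToGrayRef(const uint8_t* src, uint8_t* dst, size_t pixels) {
    for (size_t i = 0; i < pixels; ++i) {
        uint32_t b = src[4 * i + 0];
        uint32_t g = src[4 * i + 1];
        uint32_t r = src[4 * i + 2];
        dst[i] = (uint8_t)((7 * b + 38 * g + 19 * r) >> 6);
    }
}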

+ 46 - 0
source/backend/cpu/arm/arm32/MNNBGRToBGR555Fast.S

@@ -0,0 +1,46 @@
+//
+//  MNNBGRToBGR555Fast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNBGRToBGR555Fast
+// void MNNBGRToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+vmov.s8 q15, #8
+vneg.s8 q15, q15
+
+L1:
+vld3.8 {d0, d1, d2}, [r0]!
+vand.u8 d2, d2, d30 // r & ~7
+vand.u8 d1, d1, d30 // g & ~7
+vshr.u8 d0, d0, #3  // b >> 3
+vshll.u8 q2, d2, #7
+vshll.u8 q3, d1, #2
+vmovl.u8 q8, d0
+vorr.u8 q2, q2, q3
+vorr.u8 q2, q2, q8
+
+vst1.16 {q2}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif
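
For reference, the 555 packing above keeps the top five bits of each channel and produces R[14:10] G[9:5] B[4:0] in each 16-bit output; a scalar sketch (packBGR555 is illustrative, not an MNN symbol):

#include <cstdint>

static inline uint16_t packBGR555(uint8_t b, uint8_t g, uint8_t r) {
    // (r & ~7) << 7 | (g & ~7) << 2 | (b >> 3), matching the vand/vshll/vshr sequence above
    return (uint16_t)(((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3));
}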

+ 51 - 0
source/backend/cpu/arm/arm32/MNNBGRToBGR565Fast.S

@@ -0,0 +1,51 @@
+//
+//  MNNBGRToBGR565Fast.S
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNBGRToBGR565Fast
+// void MNNBGRToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+
+push {lr}
+vmov.s8 q15, #8
+vneg.s8 q15, q15
+vmov.s8 q14, #4
+vneg.s8 q14, q14
+
+L1:
+vld3.8 {d0, d1, d2}, [r0]! // b, g, r
+
+vand.u8 d2, d2, d30 // r & ~7
+vand.u8 d1, d1, d28 // g & ~3
+vshr.u8 d0, d0, #3  // b >> 3
+
+vshll.u8 q2, d2, #7
+vshl.u8 q2, q2, #1
+vshll.u8 q3, d1, #3
+vmovl.u8 q8, d0
+
+vorr.u8 q2, q2, q3
+vorr.u8 q2, q2, q8
+
+vst1.16 {q2}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif
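
The 565 variant keeps six bits of green and shifts red one bit further, giving R[15:11] G[10:5] B[4:0]; scalar sketch (packBGR565 is illustrative only):

#include <cstdint>

static inline uint16_t packBGR565(uint8_t b, uint8_t g, uint8_t r) {
    // (r & ~7) << 8 | (g & ~3) << 3 | (b >> 3), matching the shll-by-7-then-shl-by-1 above
    return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
}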

+ 46 - 0
source/backend/cpu/arm/arm32/MNNBGRToGRAYFast.S

@@ -0,0 +1,46 @@
+//
+//  MNNBGRToGRAYFast.S
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNBGRToGRAYFast
+// void MNNBGRToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+// b*7
+// g*38
+// r*19
+
+push {lr}
+
+vmov.i8 d4, #7
+vmov.i8 d5, #38
+vmov.i8 d6, #19
+
+L1:
+vld3.8 {d0, d1, d2}, [r0]! // b,g,r
+vmull.u8 q8, d0, d4
+vmlal.u8 q8, d1, d5
+vmlal.u8 q8, d2, d6
+
+vqshrn.u16 d16, q8, #6
+vst1.8 {d16}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 34 - 0
source/backend/cpu/arm/arm32/MNNC3ToC4Fast.S

@@ -0,0 +1,34 @@
+//
+//  MNNC3ToC4Fast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNC3ToC4Fast
+// void MNNC3ToC4Fast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+vmov.i8 d3, #255
+L1:
+vld3.8 {d0, d1, d2}, [r0]!
+vst4.u8 {d0, d1, d2, d3}, [r1]!
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif
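
The C3-to-C4 kernel simply widens three-channel pixels to four channels with an opaque alpha; a scalar reference, with the caveat that count in the assembly is a block count of 8 pixels per iteration (c3ToC4Ref is illustrative only):

#include <cstddef>
#include <cstdint>

static void c3ToC4Ref(const uint8_t* src, uint8_t* dst, size_t pixels) {
    for (size_t i = 0; i < pixels; ++i) {
        dst[4 * i + 0] = src[3 * i + 0];
        dst[4 * i + 1] = src[3 * i + 1];
        dst[4 * i + 2] = src[3 * i + 2];
        dst[4 * i + 3] = 255;   // vmov.i8 d3, #255 above
    }
}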

+ 95 - 0
source/backend/cpu/arm/arm32/MNNC3ToXYZFast.S

@@ -0,0 +1,95 @@
+//
+//  MNNC3ToXYZFast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNC3ToXYZFast
+// void MNNC3ToXYZFast(const unsigned char* source, unsigned char* dest, size_t count, int32_t* c);
+// Auto Load: r0: source, r1: dest, r2: count, r3: c
+
+push {lr}
+vpush {q4-q7}
+
+// q4-q6, const
+vld1.32 {d8[0]}, [r3]! // C0
+vld1.32 {d8[1]}, [r3]! // C1
+vld1.32 {d9[0]}, [r3]! // C2
+vld1.32 {d9[1]}, [r3]! // C3
+vld1.32 {d10[0]}, [r3]! // C4
+vld1.32 {d10[1]}, [r3]! // C5
+vld1.32 {d11[0]}, [r3]! // C6
+vld1.32 {d11[1]}, [r3]! // C7
+vld1.32 {d12[0]}, [r3]! // C8
+
+vmov.u16 q15, #128
+
+L1:
+vld3.8 {d0, d1, d2}, [r0]!
+vmovl.u8 q2, d0 // r: uint8_t -> uint16_t
+vmovl.u8 q3, d1
+vmovl.u8 q13, d2
+
+vmovl.u16 q7, d4 // r
+vmovl.u16 q8, d5 // r
+vmovl.u16 q9, d6 // g
+vmovl.u16 q10, d7 // g
+vmovl.u16 q11, d26 // b
+vmovl.u16 q12, d27 // b
+
+// r*C0, g*C1, b*C2
+vmul.s32 q0, q7, d8[0]
+vmul.s32 q1, q8, d8[0]
+vmla.s32 q0, q9, d8[1]
+vmla.s32 q1, q10, d8[1]
+vmla.s32 q0, q11, d9[0]
+vmla.s32 q1, q12, d9[0]
+
+// r*C3, g*C4, b*C5
+vmul.s32 q2, q7, d9[1]
+vmul.s32 q3, q8, d9[1]
+vmla.s32 q2, q9, d10[0]
+vmla.s32 q3, q10, d10[0]
+vmla.s32 q2, q11, d10[1]
+vmla.s32 q3, q12, d10[1]
+
+// r*C6, g*C7, b*C8
+vmul.s32 q13, q7, d11[0]
+vmul.s32 q14, q8, d11[0]
+vmla.s32 q13, q9, d11[1]
+vmla.s32 q14, q10, d11[1]
+vmla.s32 q13, q11, d12[0]
+vmla.s32 q14, q12, d12[0]
+
+vrshrn.u32 d0, q0, #12
+vrshrn.u32 d1, q1, #12
+vrshrn.u32 d2, q2, #12
+vrshrn.u32 d3, q3, #12
+vrshrn.u32 d4, q13, #12
+vrshrn.u32 d5, q14, #12
+
+vqmovn.u16 d0, q0
+vqmovn.u16 d1, q1
+vqmovn.u16 d2, q2
+
+vst3.8 {d0, d1, d2}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+vpop {q4-q7}
+pop {pc}
+
+#endif
+#endif

+ 98 - 0
source/backend/cpu/arm/arm32/MNNC3ToYUVFast.S

@@ -0,0 +1,98 @@
+//
+//  MNNC3ToYUVFast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNC3ToYUVFast
+// void MNNC3ToYUVFast(const unsigned char* source, unsigned char* dest, size_t count, int32_t* c);
+// Auto Load: r0: source, r1: dest, r2: count, r3: c
+
+push {lr}
+vpush {q4-q7}
+
+// q4-q6, const
+vld1.32 {d8[0]}, [r3]! // C0
+vld1.32 {d8[1]}, [r3]! // C1
+vld1.32 {d9[0]}, [r3]! // C2
+vld1.32 {d9[1]}, [r3]! // C3
+vld1.32 {d10[0]}, [r3]! // C4
+vld1.32 {d10[1]}, [r3]! // C5
+vld1.32 {d11[0]}, [r3]! // C6
+vld1.32 {d11[1]}, [r3]! // C7
+vld1.32 {d12[0]}, [r3]! // C8
+
+vmov.u16 q15, #128
+
+L1:
+vld3.8 {d0, d1, d2}, [r0]!
+vmovl.u8 q2, d0 // r: uint8_t -> uint16_t
+vmovl.u8 q3, d1
+vmovl.u8 q13, d2
+
+vmovl.u16 q7, d4 // r
+vmovl.u16 q8, d5 // r
+vmovl.u16 q9, d6 // g
+vmovl.u16 q10, d7 // g
+vmovl.u16 q11, d26 // b
+vmovl.u16 q12, d27 // b
+
+// r*C0, g*C1, b*C2
+vmul.s32 q0, q7, d8[0]
+vmul.s32 q1, q8, d8[0]
+vmla.s32 q0, q9, d8[1]
+vmla.s32 q1, q10, d8[1]
+vmla.s32 q0, q11, d9[0]
+vmla.s32 q1, q12, d9[0]
+
+// r*C3, g*C4, b*C5
+vmul.s32 q2, q7, d9[1]
+vmul.s32 q3, q8, d9[1]
+vmla.s32 q2, q9, d10[0]
+vmla.s32 q3, q10, d10[0]
+vmla.s32 q2, q11, d10[1]
+vmla.s32 q3, q12, d10[1]
+
+// r*C6, g*C7, b*C8
+vmul.s32 q13, q7, d11[0]
+vmul.s32 q14, q8, d11[0]
+vmla.s32 q13, q9, d11[1]
+vmla.s32 q14, q10, d11[1]
+vmla.s32 q13, q11, d12[0]
+vmla.s32 q14, q12, d12[0]
+
+vrshrn.u32 d0, q0, #14
+vrshrn.u32 d1, q1, #14
+vrshrn.u32 d2, q2, #14
+vrshrn.u32 d3, q3, #14
+vrshrn.u32 d4, q13, #14
+vrshrn.u32 d5, q14, #14
+
+vadd.u16 q1, q1, q15
+vadd.u16 q2, q2, q15
+
+vqmovn.u16 d0, q0
+vqmovn.u16 d1, q1
+vqmovn.u16 d2, q2
+
+vst3.8 {d0, d1, d2}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+vpop {q4-q7}
+pop {pc}
+
+#endif
+#endif
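
Both MNNC3ToXYZFast and MNNC3ToYUVFast above apply a 3x3 integer matrix passed in through c, in Q12 fixed point for XYZ and Q14 for YUV, with 128 added to the two chroma outputs in the YUV case. A per-pixel scalar sketch under those assumptions (c3Transform is illustrative, not MNN API):

#include <algorithm>
#include <cstdint>

static void c3Transform(const uint8_t in[3], uint8_t out[3], const int32_t c[9],
                        int shift, bool addChromaOffset) {
    int32_t r = in[0], g = in[1], b = in[2];
    int32_t half = 1 << (shift - 1);            // vrshrn rounds to nearest
    for (int k = 0; k < 3; ++k) {
        int32_t v = (c[3 * k] * r + c[3 * k + 1] * g + c[3 * k + 2] * b + half) >> shift;
        if (addChromaOffset && k > 0) v += 128; // U and V are centred at 128
        out[k] = (uint8_t)std::min(std::max(v, 0), 255);
    }
}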

+ 33 - 10
source/backend/cpu/arm/arm32/MNNFloat2Int8.S

@@ -22,26 +22,49 @@ vcvt.s32.f32 \x, q13
 .endm
 
 asm_function MNNFloat2Int8
-//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, ssize_t aMin, ssize_t aMax, ssize_t zeroPoint);
-//r0:src, r1:dst, r2:sizeQuad, r3:scale, r4:aMin, r5:aMax, r6:zeroPoint
-
+//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, ssize_t aMin, ssize_t aMax, float* zeroPoint, ssize_t quanParamVec);
+// Auto load: r0:src, r1:dst, r2:sizeQuad, r3:scale
+// Load from sp: aMin, aMax, lr: zeroPoint, r12: quanParamVec
 push {lr}
 
 vmov.f32 q10, #0.5
 vmov.f32 q11, #-0.5
-
-ldr r12, [sp, #4]
-vld1.32 {q15}, [r3]
+vmov.s32 q1, #1
+// scale
+vld1.32 {d30[0]}, [r3]
+vdup.32 q15, d30[0]
 // min
+ldr r12, [sp, #4]
 vdup.s8 d28, r12
 // max
 ldr r12, [sp, #8]
 vdup.s8 d29, r12
 // zeropoint
-ldr r12, [sp, #12]
-vdup.s32 q9, r12
-vcvt.f32.s32 q9, q9
-
+ldr lr, [sp, #12]
+vld1.32 {d18[0]}, [lr]
+vdup.32 q9, d18[0]
+
+// quanParamVec
+ldr r12, [sp, #16]
+cmp r12, #3
+bne LOAD_VEC_ZERO
+vld1.f32 {q9}, [lr] // load vector zero
+vld1.f32 {q15}, [r3] // load vector scale
+b COMPUTE
+
+LOAD_VEC_ZERO:
+cmp r12, #2
+bne LOAD_VEC_SCALE
+vld1.f32 {q9}, [lr] // load vector zero
+b COMPUTE
+
+LOAD_VEC_SCALE:
+cmp r12, #1
+bne COMPUTE
+vld1.f32 {q15}, [r3] // load vector scale
+
+
+COMPUTE:
 cmp r2, #3
 ble FL1
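
The new quanParamVec argument selects per-tensor versus per-lane quantization parameters; from the branches above, bit 0 switches scale to a 4-lane vector and bit 1 does the same for zeroPoint (value 3 loads both). A hedged scalar sketch of the whole kernel under that reading (float2Int8Ref is a stand-in, not the exported symbol):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

static void float2Int8Ref(const float* src, int8_t* dst, size_t sizeQuad,
                          const float* scale, int aMin, int aMax,
                          const float* zeroPoint, int quanParamVec) {
    float s[4], z[4];
    for (int i = 0; i < 4; ++i) {
        s[i] = (quanParamVec & 1) ? scale[i] : scale[0];         // bit0: vector scale
        z[i] = (quanParamVec & 2) ? zeroPoint[i] : zeroPoint[0]; // bit1: vector zero point
    }
    for (size_t q = 0; q < sizeQuad; ++q) {
        for (int i = 0; i < 4; ++i) {
            int v = (int)roundf(src[4 * q + i] * s[i] + z[i]);   // fcvtas/vcvt round-to-nearest
            dst[4 * q + i] = (int8_t)std::min(std::max(v, aMin), aMax);
        }
    }
}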
 

+ 35 - 0
source/backend/cpu/arm/arm32/MNNGRAYToC3Fast.S

@@ -0,0 +1,35 @@
+//
+//  MNNGRAYToC3Fast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNGRAYToC3Fast
+// void MNNGRAYToC3Fast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+L1:
+vld1.8 {d0}, [r0]!
+vmov d1, d0
+vmov d2, d0
+vst3.u8 {d0, d1, d2}, [r1]!
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 36 - 0
source/backend/cpu/arm/arm32/MNNGRAYToC4Fast.S

@@ -0,0 +1,36 @@
+//
+//  MNNGRAYToC4Fast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNGRAYToC4Fast
+// void MNNGRAYToC4Fast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+vmov.i8 d3, #255
+L1:
+vld1.8 {d0}, [r0]!
+vmov d1, d0
+vmov d2, d0
+vst4.u8 {d0, d1, d2, d3}, [r1]!
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 2 - 1
source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S

@@ -51,7 +51,7 @@ ldr r8, [r6, #0]
 ldr lr, [r6, #4]
 
 vpush {q4-q7}
-
+sub sp, sp, #36
 
 ldr r7, [r6, #16]  // r7: useInt8
 
@@ -418,6 +418,7 @@ L1LoopCheck:
     bne L1LoopDz
 
 End:
+add sp, sp, #36
 vpop {q4-q7}
 pop {r4-r8, r10, pc}
 

+ 2 - 0
source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S

@@ -42,6 +42,7 @@ ldr r8, [r6, #0]
 ldr lr, [r6, #4]
 
 vpush {q4-q7}
+sub sp, sp, #36
 
 // Only int8 output use this kernel.
 
@@ -301,6 +302,7 @@ L1LoopCheck:
     bne L1LoopDz
 
 End:
+add sp, sp, #36
 vpop {q4-q7}
 pop {r4-r8, r10, pc}
 

+ 2 - 1
source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S

@@ -51,7 +51,7 @@ ldr r8, [r6, #0]
 ldr lr, [r6, #4]
 
 vpush {q4-q7}
-
+sub sp, sp, #36
 // Branch1: input is int8_t, output is float32, DO NOT USE "scale".
 // Branch2: input is int8_t, output is float32. USE "scale", DO NOT USE "minValue" and "maxValue".
 // Branch3: input is int8_t, output is int8_t.  USE "scale", "minValue" and "maxValue".
@@ -398,6 +398,7 @@ L1LoopCheck:
     bne L1LoopDz
 
 End:
+add sp, sp, #36
 vpop {q4-q7}
 pop {r4-r8, r10, pc}
 

+ 38 - 0
source/backend/cpu/arm/arm32/MNNRGBAToBGRAFast.S

@@ -0,0 +1,38 @@
+//
+//  MNNRGBAToBGRAFast.S
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNRGBAToBGRAFast
+// void MNNRGBAToBGRAFast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+L1:
+vld4.8 {d0, d1, d2, d3}, [r0]! // r,g,b,a
+
+// swap d0,d2
+vmov.32 d4, d2
+vmov.32 d2, d0
+vmov.32 d0, d4
+vst4.8 {d0, d1, d2, d3}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 38 - 0
source/backend/cpu/arm/arm32/MNNRGBAToBGRFast.S

@@ -0,0 +1,38 @@
+//
+//  MNNRGBAToBGRFast.S
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNRGBAToBGRFast
+// void MNNRGBAToBGRFast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+L1:
+vld4.8 {d0, d1, d2, d3}, [r0]! // r,g,b,a
+
+// swap d0,d2
+vmov.32 d4, d2
+vmov.32 d2, d0
+vmov.32 d0, d4
+vst3.8 {d0, d1, d2}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 43 - 0
source/backend/cpu/arm/arm32/MNNRGBAToGRAYFast.S

@@ -0,0 +1,43 @@
+//
+//  MNNRGBAToGRAYFast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNRGBAToGRAYFast
+// void MNNRGBAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+vmov.i8 d4, #7
+vmov.i8 d5, #38
+vmov.i8 d6, #19
+
+L1:
+vld4.8 {d0, d1, d2, d3}, [r0]!
+vmull.u8 q8, d2, d4 // b*7
+vmlal.u8 q8, d1, d5 // g*38
+vmlal.u8 q8, d0, d6 // r*19
+
+vqshrn.u16 d16, q8, #6
+vst1.8 {d16}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 46 - 0
source/backend/cpu/arm/arm32/MNNRGBToBGR555Fast.S

@@ -0,0 +1,46 @@
+//
+//  MNNRGBToBGR555Fast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNRGBToBGR555Fast
+// void MNNRGBToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+vmov.s8 q15, #8
+vneg.s8 q15, q15
+
+L1:
+vld3.8 {d0, d1, d2}, [r0]!
+vand.u8 d0, d0, d30 // r & ~7
+vand.u8 d1, d1, d30 // g & ~7
+vshr.u8 d2, d2, #3  // b >> 3
+vshll.u8 q2, d0, #7
+vshll.u8 q3, d1, #2
+vmovl.u8 q8, d2
+vorr.u8 q2, q2, q3
+vorr.u8 q2, q2, q8
+
+vst1.16 {q2}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 54 - 0
source/backend/cpu/arm/arm32/MNNRGBToBGR565Fast.S

@@ -0,0 +1,54 @@
+//
+//  MNNRGBToBGR565Fast.S
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNRGBToBGR565Fast
+// void MNNRGBToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+// b*7
+// g*38
+// r*19
+
+push {lr}
+vmov.s8 q15, #8
+vneg.s8 q15, q15
+vmov.s8 q14, #4
+vneg.s8 q14, q14
+
+L1:
+vld3.8 {d0, d1, d2}, [r0]! // r,g,b
+
+vand.u8 d0, d0, d30 // r & ~7
+vand.u8 d1, d1, d28 // g & ~3
+vshr.u8 d2, d2, #3  // b >> 3
+
+vshll.u8 q2, d0, #7
+vshl.u8 q2, q2, #1
+vshll.u8 q3, d1, #3
+vmovl.u8 q8, d2
+
+vorr.u8 q2, q2, q3
+vorr.u8 q2, q2, q8
+
+vst1.16 {q2}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 36 - 0
source/backend/cpu/arm/arm32/MNNRGBToBGRC8.S

@@ -0,0 +1,36 @@
+//
+//  MNNRGBToBGRC8.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNRGBToBGRC8
+// void MNNRGBToBGRC8(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+L1:
+vld3.8 {d0, d1, d2}, [r0]!
+vmov d3, d2
+vmov d4, d1
+vmov d5, d0
+vst3.8 {d3, d4, d5}, [r1]!
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 43 - 0
source/backend/cpu/arm/arm32/MNNRGBToGRAYFast.S

@@ -0,0 +1,43 @@
+//
+//  MNNRGBToGRAYFast.S
+//  MNN
+//
+//  Created by MNN on 2024/08/28.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNRGBToGRAYFast
+// void MNNRGBToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count);
+// Auto Load: r0: source, r1: dest, r2: count
+
+push {lr}
+
+vmov.i8 d4, #7
+vmov.i8 d5, #38
+vmov.i8 d6, #19
+
+L1:
+vld3.8 {d0, d1, d2}, [r0]!
+vmull.u8 q8, d2, d4 // b*7
+vmlal.u8 q8, d1, d5 // g*38
+vmlal.u8 q8, d0, d6 // r*19
+
+vqshrn.u16 d16, q8, #6
+vst1.8 {d16}, [r1]!
+
+subs r2, r2, #1
+bne L1
+
+End:
+pop {pc}
+
+#endif
+#endif

+ 129 - 0
source/backend/cpu/arm/arm64/MNNBGRAToBGR.S

@@ -0,0 +1,129 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+// void MNNBGRAToBGRC8(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNBGRAToBGRC8
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+L12:
+cmp x2, #12
+blt L8
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64
+ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64
+ld4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64
+sub x2, x2, #12
+mov v16.16b, v0.16b
+mov v17.16b, v1.16b
+mov v18.16b, v2.16b
+mov v19.16b, v4.16b
+mov v20.16b, v5.16b
+mov v21.16b, v6.16b
+
+mov v22.16b, v8.16b
+mov v23.16b, v9.16b
+mov v24.16b, v10.16b
+mov v25.16b, v12.16b
+mov v26.16b, v13.16b
+mov v27.16b, v14.16b
+
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+
+mov v4.16b, v28.16b
+mov v5.16b, v29.16b
+mov v6.16b, v30.16b
+mov v8.16b, v0.16b
+mov v9.16b, v1.16b
+mov v10.16b, v2.16b
+
+
+st3 {v16.16b, v17.16b, v18.16b}, [x1], #48
+st3 {v19.16b, v20.16b, v21.16b}, [x1], #48
+st3 {v22.16b, v23.16b, v24.16b}, [x1], #48
+st3 {v25.16b, v26.16b, v27.16b}, [x1], #48
+st3 {v4.16b, v5.16b, v6.16b}, [x1], #48
+st3 {v8.16b, v9.16b, v10.16b}, [x1], #48
+
+b L12
+
+
+L8:
+cmp x2, #8
+blt L4
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64
+ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64
+sub x2, x2, #8
+mov v16.16b, v0.16b
+mov v17.16b, v1.16b
+mov v18.16b, v2.16b
+mov v19.16b, v4.16b
+mov v20.16b, v5.16b
+mov v21.16b, v6.16b
+
+mov v22.16b, v8.16b
+mov v23.16b, v9.16b
+mov v24.16b, v10.16b
+mov v25.16b, v12.16b
+mov v26.16b, v13.16b
+mov v27.16b, v14.16b
+
+st3 {v16.16b, v17.16b, v18.16b}, [x1], #48
+st3 {v19.16b, v20.16b, v21.16b}, [x1], #48
+st3 {v22.16b, v23.16b, v24.16b}, [x1], #48
+st3 {v25.16b, v26.16b, v27.16b}, [x1], #48
+b L8
+
+L4:
+cmp x2, #4
+blt L2
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v6.16b, v7.16b, v8.16b, v9.16b}, [x0], #64
+sub x2, x2, #4
+mov v10.16b, v0.16b
+mov v11.16b, v1.16b
+mov v12.16b, v2.16b
+mov v13.16b, v6.16b
+mov v14.16b, v7.16b
+mov v15.16b, v8.16b
+
+st3 {v10.16b, v11.16b, v12.16b}, [x1], #48
+st3 {v13.16b, v14.16b, v15.16b}, [x1], #48
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+mov v4.16b, v0.16b
+mov v5.16b, v1.16b
+mov v6.16b, v2.16b
+sub x2, x2, #2
+st3 {v4.16b, v5.16b, v6.16b}, [x1], #48
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32
+mov v5.8b, v0.8b
+mov v6.8b, v1.8b
+mov v7.8b, v2.8b
+st3 {v5.8b, v6.8b, v7.8b}, [x1], #24
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif

+ 92 - 0
source/backend/cpu/arm/arm64/MNNBGRAToGRAY.S

@@ -0,0 +1,92 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNBGRAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNBGRAToGRAYFast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+movi v29.16b, #7
+movi v30.16b, #38
+movi v31.16b, #19
+
+L4:
+cmp x2, #4
+blt L2
+
+sub x2, x2, #4
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v14.16b, v15.16b, v16.16b, v17.16b}, [x0], #64
+
+umull v4.8h, v0.8b, v29.8b // b*7
+umlal v4.8h, v1.8b, v30.8b // g*38
+umlal v4.8h, v2.8b, v31.8b // r*19
+
+umull2 v7.8h, v0.16b, v29.16b // b*7
+umlal2 v7.8h, v1.16b, v30.16b // g*38
+umlal2 v7.8h, v2.16b, v31.16b // r*19
+
+umull v18.8h, v14.8b, v29.8b // b*7
+umlal v18.8h, v15.8b, v30.8b // g*38
+umlal v18.8h, v16.8b, v31.8b // r*19
+
+umull2 v21.8h, v14.16b, v29.16b // b*7
+umlal2 v21.8h, v15.16b, v30.16b // g*38
+umlal2 v21.8h, v16.16b, v31.16b // r*19
+
+uqshrn v4.8b, v4.8h, #6
+uqshrn2 v4.16b, v7.8h, #6
+uqshrn v5.8b, v18.8h, #6
+uqshrn2 v5.16b, v21.8h, #6
+
+st1 {v4.16b, v5.16b}, [x1], #32
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+
+sub x2, x2, #2
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+
+umull v4.8h, v0.8b, v29.8b // b*7
+umlal v4.8h, v1.8b, v30.8b // g*38
+umlal v4.8h, v2.8b, v31.8b // r*19
+
+umull2 v7.8h, v0.16b, v29.16b // b*7
+umlal2 v7.8h, v1.16b, v30.16b // g*38
+umlal2 v7.8h, v2.16b, v31.16b // r*19
+
+uqshrn v4.8b, v4.8h, #6
+uqshrn2 v4.16b, v7.8h, #6
+
+st1 {v4.16b}, [x1], #16
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32
+
+umull v4.8h, v0.8b, v29.8b // b*7
+umlal v4.8h, v1.8b, v30.8b // g*38
+umlal v4.8h, v2.8b, v31.8b // r*19
+
+uqshrn v10.8b, v4.8h, #6
+
+st1 {v10.8b}, [x1], #8
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif

+ 169 - 0
source/backend/cpu/arm/arm64/MNNBGRToBGR555.S

@@ -0,0 +1,169 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNBGRToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNBGRToBGR555Fast
+// x0: source, x1: dest, x2: count, x3: c
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+movi v31.16b, #8
+neg v31.16b, v31.16b
+
+L6:
+cmp x2, #6
+blt L4
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48
+ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48
+and v2.16b, v2.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v31.16b // g & ~7
+ushr v0.16b, v0.16b, #3  // b >> 3
+and v13.16b, v13.16b, v31.16b // r & ~7
+and v12.16b, v12.16b, v31.16b // g & ~7
+ushr v11.16b, v11.16b, #3  // b >> 3
+and v26.16b, v26.16b, v31.16b // r & ~7
+and v25.16b, v25.16b, v31.16b // g & ~7
+ushr v24.16b, v24.16b, #3  // b >> 3
+sub x2, x2, #6
+
+ushll v3.8h, v2.8b, #7
+ushll v4.8h, v1.8b, #2
+uxtl v5.8h, v0.8b
+ushll2 v8.8h, v2.16b, #7
+ushll2 v9.8h, v1.16b, #2
+uxtl2 v10.8h, v0.16b
+
+ushll v14.8h, v13.8b, #7
+ushll v15.8h, v12.8b, #2
+uxtl v16.8h, v11.8b
+ushll2 v17.8h, v13.16b, #7
+ushll2 v18.8h, v12.16b, #2
+uxtl2 v19.8h, v11.16b
+
+ushll v6.8h, v26.8b, #7
+ushll v7.8h, v25.8b, #2
+uxtl v27.8h, v24.8b
+ushll2 v28.8h, v26.16b, #7
+ushll2 v29.8h, v25.16b, #2
+uxtl2 v30.8h, v24.16b
+
+orr v0.16b, v3.16b, v4.16b
+orr v0.16b, v0.16b, v5.16b
+orr v1.16b, v8.16b, v9.16b
+orr v1.16b, v1.16b, v10.16b
+
+orr v2.16b, v14.16b, v15.16b
+orr v2.16b, v2.16b, v16.16b
+orr v3.16b, v17.16b, v18.16b
+orr v3.16b, v3.16b, v19.16b
+
+orr v4.16b, v6.16b, v7.16b
+orr v4.16b, v4.16b, v27.16b
+orr v5.16b, v28.16b, v29.16b
+orr v5.16b, v5.16b, v30.16b
+
+st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+st1 {v4.8h, v5.8h}, [x1], #32
+
+b L6
+
+L4:
+cmp x2, #4
+blt L2
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48
+and v2.16b, v2.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v31.16b // g & ~7
+ushr v0.16b, v0.16b, #3  // b >> 3
+and v13.16b, v13.16b, v31.16b // r & ~7
+and v12.16b, v12.16b, v31.16b // g & ~7
+ushr v11.16b, v11.16b, #3  // b >> 3
+sub x2, x2, #4
+
+ushll v3.8h, v2.8b, #7
+ushll v4.8h, v1.8b, #2
+uxtl v5.8h, v0.8b
+ushll2 v8.8h, v2.16b, #7
+ushll2 v9.8h, v1.16b, #2
+uxtl2 v10.8h, v0.16b
+
+ushll v14.8h, v13.8b, #7
+ushll v15.8h, v12.8b, #2
+uxtl v16.8h, v11.8b
+ushll2 v17.8h, v13.16b, #7
+ushll2 v18.8h, v12.16b, #2
+uxtl2 v19.8h, v11.16b
+
+
+orr v20.16b, v3.16b, v4.16b
+orr v20.16b, v20.16b, v5.16b
+orr v21.16b, v8.16b, v9.16b
+orr v21.16b, v21.16b, v10.16b
+
+orr v22.16b, v14.16b, v15.16b
+orr v22.16b, v22.16b, v16.16b
+orr v23.16b, v17.16b, v18.16b
+orr v23.16b, v23.16b, v19.16b
+
+st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x1], #64
+
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+and v2.16b, v2.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v31.16b // g & ~7
+sub x2, x2, #2
+ushr v0.16b, v0.16b, #3  // b >> 3
+
+ushll v3.8h, v2.8b, #7
+ushll v4.8h, v1.8b, #2
+uxtl v5.8h, v0.8b
+ushll2 v8.8h, v2.16b, #7
+ushll2 v9.8h, v1.16b, #2
+uxtl2 v10.8h, v0.16b
+
+orr v6.16b, v3.16b, v4.16b
+orr v6.16b, v6.16b, v5.16b
+orr v7.16b, v8.16b, v9.16b
+orr v7.16b, v7.16b, v10.16b
+
+st1 {v6.8h, v7.8h}, [x1], #32
+
+b L2
+
+L1:
+cmp x2, #1
+blt End
+
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+and v2.8b, v2.8b, v31.8b // r & ~7
+and v1.8b, v1.8b, v31.8b // g & ~7
+ushr v0.8b, v0.8b, #3  // b >> 3
+ushll v2.8h, v2.8b, #7
+ushll v1.8h, v1.8b, #2
+uxtl v0.8h, v0.8b
+orr v3.16b, v0.16b, v1.16b
+orr v3.16b, v3.16b, v2.16b
+
+st1 {v3.8h}, [x1], #16
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif

+ 187 - 0
source/backend/cpu/arm/arm64/MNNBGRToBGR565.S

@@ -0,0 +1,187 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNBGRToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNBGRToBGR565Fast
+// x0: source, x1: dest, x2: count, x3: c
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+movi v31.16b, #8
+neg v31.16b, v31.16b
+
+L6:
+cmp x2, #6
+blt L4
+
+movi v30.16b, #4
+neg v30.16b, v30.16b
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48
+ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48
+and v2.16b, v2.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v30.16b // g & ~3
+ushr v0.16b, v0.16b, #3  // b >> 3
+and v13.16b, v13.16b, v31.16b // r & ~7
+and v12.16b, v12.16b, v30.16b // g & ~3
+ushr v11.16b, v11.16b, #3  // b >> 3
+and v26.16b, v26.16b, v31.16b // r & ~7
+and v25.16b, v25.16b, v30.16b // g & ~3
+ushr v24.16b, v24.16b, #3  // b >> 3
+sub x2, x2, #6
+
+ushll v3.8h, v2.8b, #7
+shl v3.8h, v3.8h, #1
+ushll v4.8h, v1.8b, #3
+uxtl v5.8h, v0.8b
+ushll2 v8.8h, v2.16b, #7
+shl v8.8h, v8.8h, #1
+ushll2 v9.8h, v1.16b, #3
+uxtl2 v10.8h, v0.16b
+
+ushll v14.8h, v13.8b, #7
+shl v14.8h, v14.8h, #1
+ushll v15.8h, v12.8b, #3
+uxtl v16.8h, v11.8b
+ushll2 v17.8h, v13.16b, #7
+shl v17.8h, v17.8h, #1
+ushll2 v18.8h, v12.16b, #3
+uxtl2 v19.8h, v11.16b
+
+ushll v6.8h, v26.8b, #7
+shl v6.8h, v6.8h, #1
+ushll v7.8h, v25.8b, #3
+uxtl v27.8h, v24.8b
+ushll2 v28.8h, v26.16b, #7
+shl v28.8h, v28.8h, #1
+ushll2 v29.8h, v25.16b, #3
+uxtl2 v30.8h, v24.16b
+
+orr v0.16b, v3.16b, v4.16b
+orr v0.16b, v0.16b, v5.16b
+orr v1.16b, v8.16b, v9.16b
+orr v1.16b, v1.16b, v10.16b
+
+orr v2.16b, v14.16b, v15.16b
+orr v2.16b, v2.16b, v16.16b
+orr v3.16b, v17.16b, v18.16b
+orr v3.16b, v3.16b, v19.16b
+
+orr v4.16b, v6.16b, v7.16b
+orr v4.16b, v4.16b, v27.16b
+orr v5.16b, v28.16b, v29.16b
+orr v5.16b, v5.16b, v30.16b
+
+st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+st1 {v4.8h, v5.8h}, [x1], #32
+
+b L6
+
+L4:
+movi v30.16b, #4
+neg v30.16b, v30.16b
+cmp x2, #4
+blt L2
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48
+and v2.16b, v2.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v30.16b // g & ~3
+ushr v0.16b, v0.16b, #3  // b >> 3
+and v13.16b, v13.16b, v31.16b // r & ~7
+and v12.16b, v12.16b, v30.16b // g & ~3
+ushr v11.16b, v11.16b, #3  // b >> 3
+sub x2, x2, #4
+
+ushll v3.8h, v2.8b, #7
+shl v3.8h, v3.8h, #1
+ushll v4.8h, v1.8b, #3
+uxtl v5.8h, v0.8b
+ushll2 v8.8h, v2.16b, #7
+shl v8.8h, v8.8h, #1
+ushll2 v9.8h, v1.16b, #3
+uxtl2 v10.8h, v0.16b
+
+ushll v14.8h, v13.8b, #7
+shl v14.8h, v14.8h, #1
+ushll v15.8h, v12.8b, #3
+uxtl v16.8h, v11.8b
+ushll2 v17.8h, v13.16b, #7
+shl v17.8h, v17.8h, #1
+ushll2 v18.8h, v12.16b, #3
+uxtl2 v19.8h, v11.16b
+
+
+orr v20.16b, v3.16b, v4.16b
+orr v20.16b, v20.16b, v5.16b
+orr v21.16b, v8.16b, v9.16b
+orr v21.16b, v21.16b, v10.16b
+
+orr v22.16b, v14.16b, v15.16b
+orr v22.16b, v22.16b, v16.16b
+orr v23.16b, v17.16b, v18.16b
+orr v23.16b, v23.16b, v19.16b
+
+st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x1], #64
+
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+and v2.16b, v2.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v30.16b // g & ~3
+sub x2, x2, #2
+ushr v0.16b, v0.16b, #3  // b >> 3
+
+ushll v3.8h, v2.8b, #7
+shl v3.8h, v3.8h, #1
+ushll v4.8h, v1.8b, #3
+uxtl v5.8h, v0.8b
+ushll2 v8.8h, v2.16b, #7
+shl v8.8h, v8.8h, #1
+ushll2 v9.8h, v1.16b, #3
+uxtl2 v10.8h, v0.16b
+
+orr v6.16b, v3.16b, v4.16b
+orr v6.16b, v6.16b, v5.16b
+orr v7.16b, v8.16b, v9.16b
+orr v7.16b, v7.16b, v10.16b
+
+st1 {v6.8h, v7.8h}, [x1], #32
+
+b L2
+
+L1:
+cmp x2, #1
+blt End
+
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+and v2.8b, v2.8b, v31.8b // r & ~7
+and v1.8b, v1.8b, v30.8b // g & ~3
+ushr v0.8b, v0.8b, #3  // b >> 3
+ushll v2.8h, v2.8b, #7
+shl v2.8h, v2.8h, #1
+ushll v1.8h, v1.8b, #3
+uxtl v0.8h, v0.8b
+orr v3.16b, v0.16b, v1.16b
+orr v3.16b, v3.16b, v2.16b
+
+st1 {v3.8h}, [x1], #16
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif

+ 92 - 0
source/backend/cpu/arm/arm64/MNNBGRToGRAY.S

@@ -0,0 +1,92 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNBGRToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNBGRToGRAYFast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+movi v29.16b, #7
+movi v30.16b, #38
+movi v31.16b, #19
+
+L4:
+cmp x2, #4
+blt L2
+
+sub x2, x2, #4
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v14.16b, v15.16b, v16.16b}, [x0], #48
+
+umull v4.8h, v0.8b, v29.8b // b*7
+umlal v4.8h, v1.8b, v30.8b // g*38
+umlal v4.8h, v2.8b, v31.8b // r*19
+
+umull2 v7.8h, v0.16b, v29.16b // b*7
+umlal2 v7.8h, v1.16b, v30.16b // g*38
+umlal2 v7.8h, v2.16b, v31.16b // r*19
+
+umull v18.8h, v14.8b, v29.8b // b*7
+umlal v18.8h, v15.8b, v30.8b // g*38
+umlal v18.8h, v16.8b, v31.8b // r*19
+
+umull2 v21.8h, v14.16b, v29.16b // b*7
+umlal2 v21.8h, v15.16b, v30.16b // g*38
+umlal2 v21.8h, v16.16b, v31.16b // r*19
+
+uqshrn v4.8b, v4.8h, #6
+uqshrn2 v4.16b, v7.8h, #6
+uqshrn v5.8b, v18.8h, #6
+uqshrn2 v5.16b, v21.8h, #6
+
+st1 {v4.16b, v5.16b}, [x1], #32
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+
+sub x2, x2, #2
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+
+umull v4.8h, v0.8b, v29.8b // b*7
+umlal v4.8h, v1.8b, v30.8b // g*38
+umlal v4.8h, v2.8b, v31.8b // r*19
+
+umull2 v7.8h, v0.16b, v29.16b // b*7
+umlal2 v7.8h, v1.16b, v30.16b // g*38
+umlal2 v7.8h, v2.16b, v31.16b // r*19
+
+uqshrn v4.8b, v4.8h, #6
+uqshrn2 v4.16b, v7.8h, #6
+
+st1 {v4.16b}, [x1], #16
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+
+umull v4.8h, v0.8b, v29.8b // b*7
+umlal v4.8h, v1.8b, v30.8b // g*38
+umlal v4.8h, v2.8b, v31.8b // r*19
+
+uqshrn v10.8b, v4.8h, #6
+
+st1 {v10.8b}, [x1], #8
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif

+ 116 - 0
source/backend/cpu/arm/arm64/MNNC3ToC4Fast.S

@@ -0,0 +1,116 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+// void MNNC3ToC4Fast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNC3ToC4Fast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+movi v3.16b, #255
+movi v7.16b, #255
+movi v11.16b, #255
+movi v15.16b, #255
+movi v19.16b, #255
+movi v23.16b, #255
+movi v27.16b, #255
+movi v31.16b, #255
+
+L16:
+cmp x2, #16
+blt L12
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48
+ld3 {v8.16b, v9.16b, v10.16b}, [x0], #48
+ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48
+ld3 {v16.16b, v17.16b, v18.16b}, [x0], #48
+ld3 {v20.16b, v21.16b, v22.16b}, [x0], #48
+ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48
+ld3 {v28.16b, v29.16b, v30.16b}, [x0], #48
+sub x2, x2, #16
+
+st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
+st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
+st4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64
+st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x1], #64
+st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64
+st4 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #64
+st4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x1], #64
+b L16
+
+L12:
+cmp x2, #12
+blt L8
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48
+ld3 {v8.16b, v9.16b, v10.16b}, [x0], #48
+ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48
+ld3 {v16.16b, v17.16b, v18.16b}, [x0], #48
+ld3 {v20.16b, v21.16b, v22.16b}, [x0], #48
+sub x2, x2, #12
+
+st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
+st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
+st4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64
+st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x1], #64
+st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64
+
+b L12
+
+
+L8:
+cmp x2, #8
+blt L4
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48
+ld3 {v8.16b, v9.16b, v10.16b}, [x0], #48
+ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48
+sub x2, x2, #8
+
+st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
+st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
+st4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64
+st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x1], #64
+b L8
+
+L4:
+cmp x2, #4
+blt L2
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48
+sub x2, x2, #4
+
+st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
+st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+sub x2, x2, #2
+
+st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+
+st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], #32
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif

+ 88 - 0
source/backend/cpu/arm/arm64/MNNC3ToXYZFast.S

@@ -0,0 +1,88 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNC3ToXYZFast(const unsigned char* source, unsigned char* dest, size_t count, int32_t* c);
+asm_function MNNC3ToXYZFast
+// x0: source, x1: dest, x2: count, x3: c
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+ld1r {v23.4s}, [x3], #4
+ld1r {v24.4s}, [x3], #4
+ld1r {v25.4s}, [x3], #4
+ld1r {v26.4s}, [x3], #4
+ld1r {v27.4s}, [x3], #4
+ld1r {v28.4s}, [x3], #4
+ld1r {v29.4s}, [x3], #4
+ld1r {v30.4s}, [x3], #4
+ld1r {v31.4s}, [x3], #4
+
+L1:
+cmp x2, #1
+blt End
+
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+ushll v0.8h, v0.8b, #0 // r: uint8_t -> uint16_t
+ushll v1.8h, v1.8b, #0
+ushll v2.8h, v2.8b, #0
+
+uxtl v3.4s, v0.4h  // r
+uxtl2 v4.4s, v0.8h // r
+uxtl v5.4s, v1.4h  // g
+uxtl2 v6.4s, v1.8h  // g
+uxtl v7.4s, v2.4h  // b
+uxtl2 v8.4s, v2.8h // b
+
+// r*C0, g*C1, b*C2
+mul v9.4s, v3.4s, v23.4s
+mul v10.4s, v4.4s, v23.4s
+mla v9.4s, v5.4s, v24.4s
+mla v10.4s, v6.4s, v24.4s
+mla v9.4s, v7.4s, v25.4s
+mla v10.4s, v8.4s, v25.4s
+
+// r*C3, g*C4, b*C5
+mul v15.4s, v3.4s, v26.4s
+mul v16.4s, v4.4s, v26.4s
+mla v15.4s, v5.4s, v27.4s
+mla v16.4s, v6.4s, v27.4s
+mla v15.4s, v7.4s, v28.4s
+mla v16.4s, v8.4s, v28.4s
+
+// r*C6, g*C7, b*C8
+mul v21.4s, v3.4s, v29.4s
+mul v22.4s, v4.4s, v29.4s
+mla v21.4s, v5.4s, v30.4s
+mla v22.4s, v6.4s, v30.4s
+mla v21.4s, v7.4s, v31.4s
+mla v22.4s, v8.4s, v31.4s
+
+uqrshrn v11.4h, v9.4s, #12
+uqrshrn2 v11.8h, v10.4s, #12
+uqrshrn v12.4h, v15.4s, #12
+uqrshrn2 v12.8h, v16.4s, #12
+uqrshrn v13.4h, v21.4s, #12
+uqrshrn2 v13.8h, v22.4s, #12
+
+uqxtn v14.8b, v11.8h
+uqxtn v15.8b, v12.8h
+uqxtn v16.8b, v13.8h
+
+
+st3 {v14.8b, v15.8b, v16.8b}, [x1], #24 
+sub x2, x2, #1
+b L1
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif

+ 92 - 0
source/backend/cpu/arm/arm64/MNNC3ToYUVFast.S

@@ -0,0 +1,92 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNC3ToYUVFast(const unsigned char* source, unsigned char* dest, size_t count, int32_t* c);
+asm_function MNNC3ToYUVFast
+// x0: source, x1: dest, x2: count, x3: c
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+ld1r {v23.4s}, [x3], #4
+ld1r {v24.4s}, [x3], #4
+ld1r {v25.4s}, [x3], #4
+ld1r {v26.4s}, [x3], #4
+ld1r {v27.4s}, [x3], #4
+ld1r {v28.4s}, [x3], #4
+ld1r {v29.4s}, [x3], #4
+ld1r {v30.4s}, [x3], #4
+ld1r {v31.4s}, [x3], #4
+movi v17.8h, #128
+
+L1:
+cmp x2, #1
+blt End
+
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+ushll v0.8h, v0.8b, #0 // r: uint8_t -> uint16_t
+ushll v1.8h, v1.8b, #0
+ushll v2.8h, v2.8b, #0
+
+uxtl v3.4s, v0.4h  // r
+uxtl2 v4.4s, v0.8h // r
+uxtl v5.4s, v1.4h  // g
+uxtl2 v6.4s, v1.8h  // g
+uxtl v7.4s, v2.4h  // b
+uxtl2 v8.4s, v2.8h // b
+
+// r*C0, g*C1, b*C2
+mul v9.4s, v3.4s, v23.4s
+mul v10.4s, v4.4s, v23.4s
+mla v9.4s, v5.4s, v24.4s
+mla v10.4s, v6.4s, v24.4s
+mla v9.4s, v7.4s, v25.4s
+mla v10.4s, v8.4s, v25.4s
+
+// r*C3, g*C4, b*C5
+mul v15.4s, v3.4s, v26.4s
+mul v16.4s, v4.4s, v26.4s
+mla v15.4s, v5.4s, v27.4s
+mla v16.4s, v6.4s, v27.4s
+mla v15.4s, v7.4s, v28.4s
+mla v16.4s, v8.4s, v28.4s
+
+// r*C6, g*C7, b*C8
+mul v21.4s, v3.4s, v29.4s
+mul v22.4s, v4.4s, v29.4s
+mla v21.4s, v5.4s, v30.4s
+mla v22.4s, v6.4s, v30.4s
+mla v21.4s, v7.4s, v31.4s
+mla v22.4s, v8.4s, v31.4s
+
+uqrshrn v11.4h, v9.4s, #14
+uqrshrn2 v11.8h, v10.4s, #14
+uqrshrn v12.4h, v15.4s, #14
+uqrshrn2 v12.8h, v16.4s, #14
+uqrshrn v13.4h, v21.4s, #14
+uqrshrn2 v13.8h, v22.4s, #14
+
+add v12.8h, v12.8h, v17.8h
+add v13.8h, v13.8h, v17.8h
+
+uqxtn v14.8b, v11.8h
+uqxtn v15.8b, v12.8h
+uqxtn v16.8b, v13.8h
+
+
+st3 {v14.8b, v15.8b, v16.8b}, [x1], #24 
+sub x2, x2, #1
+b L1
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif

+ 83 - 77
source/backend/cpu/arm/arm64/MNNFloat2Int8.S

@@ -14,21 +14,35 @@
 .align 5
 
 asm_function MNNFloat2Int8
-//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax, size_t zeroPoint);
-//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint
+//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax, float* zeroPoint, ssize_t quanParamVec);
+//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint, x7: quanParamVec
 stp d14, d15, [sp, #-64]!
 stp d12, d13, [sp, #16]
 stp d10, d11, [sp, #32]
 stp d8,  d9,  [sp, #48]
 
-ld1 {v31.4s}, [x3]
+ld1r {v31.4s}, [x3]
 
 dup v30.16b, w4
 dup v29.16b, w5
 
 // copy zero point
-dup v28.4s, w6
-scvtf v28.4s, v28.4s
+ld1r {v28.4s}, [x6]
+
+cmp x7, #3
+bne LOAD_SCALE_VEC
+ld1 {v31.4s}, [x3] // scale
+ld1 {v28.4s}, [x6] // zero
+b FL32
+LOAD_SCALE_VEC:
+cmp x7, #1
+bne LOAD_ZERO_VEC
+ld1 {v31.4s}, [x3] // scale
+b FL32
+LOAD_ZERO_VEC:
+cmp x7, #2
+bne FL32
+ld1 {v28.4s}, [x6] // zero
 
 FL32:
 cmp x2, #32
@@ -44,58 +58,53 @@ ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
 // ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
 // ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
 fmul v0.4s, v0.4s, v31.4s
-fadd v0.4s, v0.4s, v28.4s
 fmul v1.4s, v1.4s, v31.4s
-fadd v1.4s, v1.4s, v28.4s
 fmul v2.4s, v2.4s, v31.4s
-fadd v2.4s, v2.4s, v28.4s
 fmul v3.4s, v3.4s, v31.4s
-fadd v3.4s, v3.4s, v28.4s
-
 fmul v4.4s, v4.4s, v31.4s
-fadd v4.4s, v4.4s, v28.4s
 fmul v5.4s, v5.4s, v31.4s
-fadd v5.4s, v5.4s, v28.4s
 fmul v6.4s, v6.4s, v31.4s
-fadd v6.4s, v6.4s, v28.4s
 fmul v7.4s, v7.4s, v31.4s
-fadd v7.4s, v7.4s, v28.4s
-
 fmul v8.4s, v8.4s, v31.4s
-fadd v8.4s, v8.4s, v28.4s
 fmul v9.4s, v9.4s, v31.4s
-fadd v9.4s, v9.4s, v28.4s
 fmul v10.4s, v10.4s, v31.4s
-fadd v10.4s, v10.4s, v28.4s
 fmul v11.4s, v11.4s, v31.4s
-fadd v11.4s, v11.4s, v28.4s
-
 fmul v12.4s, v12.4s, v31.4s
-fadd v12.4s, v12.4s, v28.4s
 fmul v13.4s, v13.4s, v31.4s
-fadd v13.4s, v13.4s, v28.4s
 fmul v14.4s, v14.4s, v31.4s
-fadd v14.4s, v14.4s, v28.4s
 fmul v15.4s, v15.4s, v31.4s
-fadd v15.4s, v15.4s, v28.4s
-
-
 fmul v16.4s, v16.4s, v31.4s
-fadd v16.4s, v16.4s, v28.4s
 fmul v17.4s, v17.4s, v31.4s
-fadd v17.4s, v17.4s, v28.4s
 fmul v18.4s, v18.4s, v31.4s
-fadd v18.4s, v18.4s, v28.4s
 fmul v19.4s, v19.4s, v31.4s
-fadd v19.4s, v19.4s, v28.4s
-
 fmul v20.4s, v20.4s, v31.4s
-fadd v20.4s, v20.4s, v28.4s
 fmul v21.4s, v21.4s, v31.4s
-fadd v21.4s, v21.4s, v28.4s
 fmul v22.4s, v22.4s, v31.4s
-fadd v22.4s, v22.4s, v28.4s
 fmul v23.4s, v23.4s, v31.4s
+
+fadd v0.4s, v0.4s, v28.4s
+fadd v1.4s, v1.4s, v28.4s
+fadd v2.4s, v2.4s, v28.4s
+fadd v3.4s, v3.4s, v28.4s
+fadd v4.4s, v4.4s, v28.4s
+fadd v5.4s, v5.4s, v28.4s
+fadd v6.4s, v6.4s, v28.4s
+fadd v7.4s, v7.4s, v28.4s
+fadd v8.4s, v8.4s, v28.4s
+fadd v9.4s, v9.4s, v28.4s
+fadd v10.4s, v10.4s, v28.4s
+fadd v11.4s, v11.4s, v28.4s
+fadd v12.4s, v12.4s, v28.4s
+fadd v13.4s, v13.4s, v28.4s
+fadd v14.4s, v14.4s, v28.4s
+fadd v15.4s, v15.4s, v28.4s
+fadd v16.4s, v16.4s, v28.4s
+fadd v17.4s, v17.4s, v28.4s
+fadd v18.4s, v18.4s, v28.4s
+fadd v19.4s, v19.4s, v28.4s
+fadd v20.4s, v20.4s, v28.4s
+fadd v21.4s, v21.4s, v28.4s
+fadd v22.4s, v22.4s, v28.4s
 fadd v23.4s, v23.4s, v28.4s
 
 fcvtas v0.4s, v0.4s
@@ -171,21 +180,21 @@ sqxtn2 v4.16b, v5.8h
 sqxtn2 v6.16b, v7.8h
 
 fmul v8.4s, v8.4s, v31.4s
-fadd v8.4s, v8.4s, v28.4s
 fmul v9.4s, v9.4s, v31.4s
-fadd v9.4s, v9.4s, v28.4s
 fmul v10.4s, v10.4s, v31.4s
-fadd v10.4s, v10.4s, v28.4s
 fmul v11.4s, v11.4s, v31.4s
-fadd v11.4s, v11.4s, v28.4s
-
 fmul v12.4s, v12.4s, v31.4s
-fadd v12.4s, v12.4s, v28.4s
 fmul v13.4s, v13.4s, v31.4s
-fadd v13.4s, v13.4s, v28.4s
 fmul v14.4s, v14.4s, v31.4s
-fadd v14.4s, v14.4s, v28.4s
 fmul v15.4s, v15.4s, v31.4s
+
+fadd v8.4s, v8.4s, v28.4s
+fadd v9.4s, v9.4s, v28.4s
+fadd v10.4s, v10.4s, v28.4s
+fadd v11.4s, v11.4s, v28.4s
+fadd v12.4s, v12.4s, v28.4s
+fadd v13.4s, v13.4s, v28.4s
+fadd v14.4s, v14.4s, v28.4s
 fadd v15.4s, v15.4s, v28.4s
 
 fcvtas v8.4s, v8.4s
@@ -207,8 +216,8 @@ sqxtn v19.4h, v14.4s
 sqxtn2 v19.8h, v15.4s
 
 smin v24.16b, v24.16b, v29.16b
-smax v24.16b, v24.16b, v30.16b
 smin v25.16b, v26.16b, v29.16b
+smax v24.16b, v24.16b, v30.16b
 smax v25.16b, v25.16b, v30.16b
 
 sqxtn v20.8b, v16.8h
@@ -217,18 +226,18 @@ sqxtn v21.8b, v18.8h
 sqxtn2 v21.16b, v19.8h
 
 smin v26.16b, v0.16b, v29.16b
-smax v26.16b, v26.16b, v30.16b
 smin v27.16b, v2.16b, v29.16b
+smax v26.16b, v26.16b, v30.16b
 smax v27.16b, v27.16b, v30.16b
 
 smin v12.16b, v4.16b, v29.16b
-smax v12.16b, v12.16b, v30.16b
 smin v13.16b, v6.16b, v29.16b
+smax v12.16b, v12.16b, v30.16b
 smax v13.16b, v13.16b, v30.16b
 
 smin v14.16b, v20.16b, v29.16b
-smax v14.16b, v14.16b, v30.16b
 smin v15.16b, v21.16b, v29.16b
+smax v14.16b, v14.16b, v30.16b
 smax v15.16b, v15.16b, v30.16b
 
 st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x1], #64
@@ -248,39 +257,37 @@ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64
 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64
 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64
 fmul v0.4s, v0.4s, v31.4s
-fadd v0.4s, v0.4s, v28.4s
 fmul v1.4s, v1.4s, v31.4s
-fadd v1.4s, v1.4s, v28.4s
 fmul v2.4s, v2.4s, v31.4s
-fadd v2.4s, v2.4s, v28.4s
 fmul v3.4s, v3.4s, v31.4s
-fadd v3.4s, v3.4s, v28.4s
-
 fmul v4.4s, v4.4s, v31.4s
-fadd v4.4s, v4.4s, v28.4s
 fmul v5.4s, v5.4s, v31.4s
-fadd v5.4s, v5.4s, v28.4s
 fmul v6.4s, v6.4s, v31.4s
-fadd v6.4s, v6.4s, v28.4s
 fmul v7.4s, v7.4s, v31.4s
-fadd v7.4s, v7.4s, v28.4s
-
 fmul v8.4s, v8.4s, v31.4s
-fadd v8.4s, v8.4s, v28.4s
 fmul v9.4s, v9.4s, v31.4s
-fadd v9.4s, v9.4s, v28.4s
 fmul v10.4s, v10.4s, v31.4s
-fadd v10.4s, v10.4s, v28.4s
 fmul v11.4s, v11.4s, v31.4s
-fadd v11.4s, v11.4s, v28.4s
-
 fmul v12.4s, v12.4s, v31.4s
-fadd v12.4s, v12.4s, v28.4s
 fmul v13.4s, v13.4s, v31.4s
-fadd v13.4s, v13.4s, v28.4s
 fmul v14.4s, v14.4s, v31.4s
-fadd v14.4s, v14.4s, v28.4s
 fmul v15.4s, v15.4s, v31.4s
+
+fadd v0.4s, v0.4s, v28.4s
+fadd v1.4s, v1.4s, v28.4s
+fadd v2.4s, v2.4s, v28.4s
+fadd v3.4s, v3.4s, v28.4s
+fadd v4.4s, v4.4s, v28.4s
+fadd v5.4s, v5.4s, v28.4s
+fadd v6.4s, v6.4s, v28.4s
+fadd v7.4s, v7.4s, v28.4s
+fadd v8.4s, v8.4s, v28.4s
+fadd v9.4s, v9.4s, v28.4s
+fadd v10.4s, v10.4s, v28.4s
+fadd v11.4s, v11.4s, v28.4s
+fadd v12.4s, v12.4s, v28.4s
+fadd v13.4s, v13.4s, v28.4s
+fadd v14.4s, v14.4s, v28.4s
 fadd v15.4s, v15.4s, v28.4s
 
 fcvtas v0.4s, v0.4s
@@ -350,21 +357,21 @@ FLLoop8:
 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64
 ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64
 fmul v0.4s, v0.4s, v31.4s
-fadd v0.4s, v0.4s, v28.4s
 fmul v1.4s, v1.4s, v31.4s
-fadd v1.4s, v1.4s, v28.4s
 fmul v2.4s, v2.4s, v31.4s
-fadd v2.4s, v2.4s, v28.4s
 fmul v3.4s, v3.4s, v31.4s
-fadd v3.4s, v3.4s, v28.4s
-
 fmul v4.4s, v4.4s, v31.4s
-fadd v4.4s, v4.4s, v28.4s
 fmul v5.4s, v5.4s, v31.4s
-fadd v5.4s, v5.4s, v28.4s
 fmul v6.4s, v6.4s, v31.4s
-fadd v6.4s, v6.4s, v28.4s
 fmul v7.4s, v7.4s, v31.4s
+
+fadd v0.4s, v0.4s, v28.4s
+fadd v1.4s, v1.4s, v28.4s
+fadd v2.4s, v2.4s, v28.4s
+fadd v3.4s, v3.4s, v28.4s
+fadd v4.4s, v4.4s, v28.4s
+fadd v5.4s, v5.4s, v28.4s
+fadd v6.4s, v6.4s, v28.4s
 fadd v7.4s, v7.4s, v28.4s
 
 fcvtas v0.4s, v0.4s
@@ -405,15 +412,14 @@ cmp x2, #3
 ble FL1
 
 FLLoop4:
-ld1 {v0.4s, v1.4s}, [x0], #32
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64
 fmul v0.4s, v0.4s, v31.4s
-fadd v0.4s, v0.4s, v28.4s
-ld1 {v2.4s, v3.4s}, [x0], #32
 fmul v1.4s, v1.4s, v31.4s
-fadd v1.4s, v1.4s, v28.4s
 fmul v2.4s, v2.4s, v31.4s
-fadd v2.4s, v2.4s, v28.4s
 fmul v3.4s, v3.4s, v31.4s
+fadd v0.4s, v0.4s, v28.4s
+fadd v1.4s, v1.4s, v28.4s
+fadd v2.4s, v2.4s, v28.4s
 fadd v3.4s, v3.4s, v28.4s
 
 fcvtas v0.4s, v0.4s

+ 124 - 0
source/backend/cpu/arm/arm64/MNNGRAYToC3Fast.S

@@ -0,0 +1,124 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+// void MNNGRAYToC3Fast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNGRAYToC3Fast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+L12:
+cmp x2, #12
+blt L8
+ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld1 {v21.16b, v22.16b}, [x0], #32
+sub x2, x2, #12
+mov v5.16b, v0.16b
+mov v6.16b, v0.16b
+mov v7.16b, v0.16b
+
+mov v9.16b, v1.16b
+mov v10.16b, v1.16b
+mov v11.16b, v1.16b
+
+mov v13.16b, v2.16b
+mov v14.16b, v2.16b
+mov v15.16b, v2.16b
+
+mov v17.16b, v3.16b
+mov v18.16b, v3.16b
+mov v19.16b, v3.16b
+
+mov v23.16b, v21.16b
+mov v24.16b, v21.16b
+mov v25.16b, v21.16b
+
+mov v27.16b, v22.16b
+mov v28.16b, v22.16b
+mov v29.16b, v22.16b
+
+st3 {v5.16b, v6.16b, v7.16b}, [x1], #48
+st3 {v9.16b, v10.16b, v11.16b}, [x1], #48
+st3 {v13.16b, v14.16b, v15.16b}, [x1], #48
+st3 {v17.16b, v18.16b, v19.16b}, [x1], #48
+st3 {v23.16b, v24.16b, v25.16b}, [x1], #48
+st3 {v27.16b, v28.16b, v29.16b}, [x1], #48
+b L12
+
+
+L8:
+cmp x2, #8
+blt L4
+ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+sub x2, x2, #8
+mov v5.16b, v0.16b
+mov v6.16b, v0.16b
+mov v7.16b, v0.16b
+
+mov v9.16b, v1.16b
+mov v10.16b, v1.16b
+mov v11.16b, v1.16b
+
+mov v13.16b, v2.16b
+mov v14.16b, v2.16b
+mov v15.16b, v2.16b
+
+mov v17.16b, v3.16b
+mov v18.16b, v3.16b
+mov v19.16b, v3.16b
+
+st3 {v5.16b, v6.16b, v7.16b}, [x1], #48
+st3 {v9.16b, v10.16b, v11.16b}, [x1], #48
+st3 {v13.16b, v14.16b, v15.16b}, [x1], #48
+st3 {v17.16b, v18.16b, v19.16b}, [x1], #48
+b L8
+
+L4:
+cmp x2, #4
+blt L2
+ld1 {v0.16b, v1.16b}, [x0], #32
+sub x2, x2, #4
+mov v5.16b, v0.16b
+mov v6.16b, v0.16b
+mov v7.16b, v0.16b
+
+mov v9.16b, v1.16b
+mov v10.16b, v1.16b
+mov v11.16b, v1.16b
+
+st3 {v5.16b, v6.16b, v7.16b}, [x1], #48
+st3 {v9.16b, v10.16b, v11.16b}, [x1], #48
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+ld1 {v0.16b}, [x0], #16
+mov v5.16b, v0.16b
+mov v6.16b, v0.16b
+mov v7.16b, v0.16b
+sub x2, x2, #2
+st3 {v5.16b, v6.16b, v7.16b}, [x1], #48
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld1 {v0.8b}, [x0], #8
+mov v5.8b, v0.8b
+mov v6.8b, v0.8b
+mov v7.8b, v0.8b
+st3 {v5.8b, v6.8b, v7.8b}, [x1], #24
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif
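For reference, a scalar C++ sketch of what MNNGRAYToC3Fast appears to compute: every gray byte is replicated into three interleaved channels, matching the st3 stores above. The _ref name and the assumption that count is given in 8-pixel units are illustrative, not part of the commit.

#include <cstddef>

void MNNGRAYToC3Fast_ref(const unsigned char* source, unsigned char* dest, size_t count) {
    size_t pixels = count * 8;          // assumption: one count unit covers 8 pixels
    for (size_t i = 0; i < pixels; ++i) {
        unsigned char g = source[i];
        dest[3 * i + 0] = g;
        dest[3 * i + 1] = g;
        dest[3 * i + 2] = g;
    }
}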

+ 139 - 0
source/backend/cpu/arm/arm64/MNNGRAYToC4Fast.S

@@ -0,0 +1,139 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+// void MNNGRAYToC4Fast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNGRAYToC4Fast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+movi v31.16b, #255
+
+L12:
+cmp x2, #12
+blt L8
+ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld1 {v21.16b, v22.16b}, [x0], #32
+sub x2, x2, #12
+mov v5.16b, v0.16b
+mov v6.16b, v0.16b
+mov v7.16b, v0.16b
+mov v8.16b, v31.16b
+
+mov v9.16b, v1.16b
+mov v10.16b, v1.16b
+mov v11.16b, v1.16b
+mov v12.16b, v31.16b
+
+mov v13.16b, v2.16b
+mov v14.16b, v2.16b
+mov v15.16b, v2.16b
+mov v16.16b, v31.16b
+
+mov v17.16b, v3.16b
+mov v18.16b, v3.16b
+mov v19.16b, v3.16b
+mov v20.16b, v31.16b
+
+mov v23.16b, v21.16b
+mov v24.16b, v21.16b
+mov v25.16b, v21.16b
+mov v26.16b, v31.16b
+
+mov v27.16b, v22.16b
+mov v28.16b, v22.16b
+mov v29.16b, v22.16b
+mov v30.16b, v31.16b
+
+st4 {v5.16b, v6.16b, v7.16b, v8.16b}, [x1], #64
+st4 {v9.16b, v10.16b, v11.16b, v12.16b}, [x1], #64
+st4 {v13.16b, v14.16b, v15.16b, v16.16b}, [x1], #64
+st4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x1], #64
+st4 {v23.16b, v24.16b, v25.16b, v26.16b}, [x1], #64
+st4 {v27.16b, v28.16b, v29.16b, v30.16b}, [x1], #64
+b L12
+
+
+L8:
+cmp x2, #8
+blt L4
+ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+sub x2, x2, #8
+mov v5.16b, v0.16b
+mov v6.16b, v0.16b
+mov v7.16b, v0.16b
+mov v8.16b, v31.16b
+
+mov v9.16b, v1.16b
+mov v10.16b, v1.16b
+mov v11.16b, v1.16b
+mov v12.16b, v31.16b
+
+mov v13.16b, v2.16b
+mov v14.16b, v2.16b
+mov v15.16b, v2.16b
+mov v16.16b, v31.16b
+
+mov v17.16b, v3.16b
+mov v18.16b, v3.16b
+mov v19.16b, v3.16b
+mov v20.16b, v31.16b
+
+st4 {v5.16b, v6.16b, v7.16b, v8.16b}, [x1], #64
+st4 {v9.16b, v10.16b, v11.16b, v12.16b}, [x1], #64
+st4 {v13.16b, v14.16b, v15.16b, v16.16b}, [x1], #64
+st4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x1], #64
+b L8
+
+L4:
+cmp x2, #4
+blt L2
+ld1 {v0.16b, v1.16b}, [x0], #32
+sub x2, x2, #4
+mov v5.16b, v0.16b
+mov v6.16b, v0.16b
+mov v7.16b, v0.16b
+mov v8.16b, v31.16b
+
+mov v9.16b, v1.16b
+mov v10.16b, v1.16b
+mov v11.16b, v1.16b
+mov v12.16b, v31.16b
+
+st4 {v5.16b, v6.16b, v7.16b, v8.16b}, [x1], #64
+st4 {v9.16b, v10.16b, v11.16b, v12.16b}, [x1], #64
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+ld1 {v0.16b}, [x0], #16
+mov v5.16b, v0.16b
+mov v6.16b, v0.16b
+mov v7.16b, v0.16b
+mov v8.16b, v31.16b
+sub x2, x2, #2
+st4 {v5.16b, v6.16b, v7.16b, v8.16b}, [x1], #64
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld1 {v0.8b}, [x0], #8
+mov v5.8b, v0.8b
+mov v6.8b, v0.8b
+mov v7.8b, v0.8b
+mov v8.8b, v31.8b
+st4 {v5.8b, v6.8b, v7.8b, v8.8b}, [x1], #32
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif
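MNNGRAYToC4Fast follows the same pattern with a fourth channel forced to 255 (movi v31.16b, #255 above). A hedged scalar sketch under the same 8-pixels-per-count assumption:

#include <cstddef>

void MNNGRAYToC4Fast_ref(const unsigned char* source, unsigned char* dest, size_t count) {
    size_t pixels = count * 8;
    for (size_t i = 0; i < pixels; ++i) {
        unsigned char g = source[i];
        dest[4 * i + 0] = g;
        dest[4 * i + 1] = g;
        dest[4 * i + 2] = g;
        dest[4 * i + 3] = 255;          // alpha lane filled from v31
    }
}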

+ 27 - 30
source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S

@@ -127,7 +127,7 @@ stp x23, x24, [sp, #(16 * 8)]
 
 ldr x27, [x6, #64]  // blockNum
 mul x27, x27, x3    // blockNum * src_depth_quad_perblock
-lsl x15, x27, #4     // x15 = src_depth_quad * UNIT * SRC_UNIT
+lsl x15, x27, #5     // x15 = src_depth_quad * UNIT * SRC_UNIT
 
 ldr w28, [x6, #24]  // useInt8
 ldr x25, [x6, #40]  // xKernelSum
@@ -135,9 +135,9 @@ ldr x26, [x6, #48]  // weightQuantBias
 ldr x24, [x6, #80]  // extraScale
 
 add x23, x6, #16  // int8 max ptr
-mov x21, #4 // sizeof(int8_t) * UNIT
+mov x21, #4 // sizeof(int8_t) * pack
 cbnz w28, Start
-mov x21, #16 // sizeof(float) * UNIT
+mov x21, #16 // sizeof(float) * pack
 ldr x23, [x6, #56]  // fp32minmax
 Start:
 mov x22, #48 // src_steps
@@ -148,7 +148,6 @@ TILE_12:
     cmp x5, #2
     blt L4LoopDz_TILE_12
 L8LoopDz_TILE_12:
-    //ld1 {v0.4s, v1.4s}, [x9], #32 // bias
     mov x11, x1
     mov x13, x3
     mov x20, x0 // tag dst address
@@ -162,13 +161,13 @@ L8LoopDz_TILE_12:
     SET_BIAS v28, v29, v30, v31
 
     L8LoopSz_TILE_12:
-        ld1 {v3.16b}, [x2], x15 // weight
+        ld1 {v3.16b, v4.16b}, [x2], #32 // weight
         ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
         .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
-        ld1 {v4.16b}, [x2], #16
+
         .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0]
         .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
         .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
@@ -181,7 +180,7 @@ L8LoopDz_TILE_12:
         .inst 0x4fa0e095 // sdot v21.4s, v4.16b, v0.4b[1]
         .inst 0x4f80e896 // sdot v22.4s, v4.16b, v0.4b[2]
         .inst 0x4fa0e897 // sdot v23.4s, v4.16b, v0.4b[3]
-        sub x2, x2, x15
+
         .inst 0x4f81e098 // sdot v24.4s, v4.16b, v1.4b[0]
         .inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]
         .inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]
@@ -194,8 +193,7 @@ L8LoopDz_TILE_12:
         bne L8LoopSz_TILE_12
 
     L8LoopSzEnd_TILE_12:
-    // add x2, x2, x15
-    add x2, x27, x15, LSL #1
+    add x2, x27, x15
     sub x5, x5, #2
 
     L8Tile12Quan:
@@ -352,7 +350,7 @@ L8LoopDz_TILE_12:
     L8Tile12LoopCheck:
     cmp x5, #1
     bgt L8LoopDz_TILE_12
-    blt End
+    cbz x5, End
 
 L4LoopDz_TILE_12:
     SET_BIAS v8, v9, v10, v11
@@ -360,7 +358,7 @@ L4LoopDz_TILE_12:
     SET_BIAS v16, v17, v18, v19
 
     L4LoopSz_TILE_12:
-        ld1 {v3.16b}, [x2], #16 // weight
+        ld1 {v3.16b}, [x2] // weight
         ld1 {v0.16b, v1.16b, v2.16b}, [x1], #48 // src
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
@@ -370,6 +368,7 @@ L4LoopDz_TILE_12:
         .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
         .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
         .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3]
+        add x2, x2, #32 // weight offset=lp*hp=32
         subs x3, x3, #1
         .inst 0x4f82e070 // sdot v16.4s, v3.16b, v2.4b[0]
         .inst 0x4fa2e071 // sdot v17.4s, v3.16b, v2.4b[1]
@@ -497,18 +496,18 @@ L8LoopDz_TILE_8:
     SET_BIAS v20, v21, v22, v23
 
     L8LoopSz_TILE_8:
-        ld1 {v3.16b}, [x12], x15 // weight
+        ld1 {v3.16b, v4.16b}, [x12], #32 // weight
         ld1 {v0.16b, v1.16b}, [x11], x22 // src
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
         .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
-        ld1 {v4.16b}, [x12], #16
+
         .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0]
         .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
         .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
         .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3]
-        sub x12, x12, x15
+
         .inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]
         .inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]
         .inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]
@@ -521,8 +520,7 @@ L8LoopDz_TILE_8:
         bne L8LoopSz_TILE_8
 
     L8LoopSzEnd_TILE_8:
-    //add x12, x12, x15
-    add x12, x27, x15, LSL #1
+    add x12, x27, x15
     sub x14, x14, #2
 
     L8Tile8Quan:
@@ -652,12 +650,13 @@ L4LoopDz_TILE_8:
     SET_BIAS v12, v13, v14, v15
 
     L4LoopSz_TILE_8:
-        ld1 {v3.16b}, [x12], #16 // weight
+        ld1 {v3.16b}, [x12] // weight
         ld1 {v0.16b, v1.16b}, [x11], x22 // src
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
         .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
+        add x12, x12, #32 // weight offset=lp*hp
         subs x13, x13, #1
         .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0]
         .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
@@ -772,15 +771,14 @@ L8LoopDz_TILE_4:
     SET_BIAS v12, v13, v14, v15
 
     L8LoopSz_TILE_4:
-        ld1 {v3.16b}, [x12], x15 // weight
+        ld1 {v3.16b, v4.16b}, [x12], #32 // weight
         ld1 {v0.16b}, [x11], x22 // src
-        ld1 {v4.16b}, [x12], #16 // weight
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
         .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
+
         subs x13, x13, #1
-        sub x12, x12, x15
         .inst 0x4f80e08c // sdot v12.4s, v4.16b, v0.4b[0]
         .inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]
         .inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]
@@ -788,8 +786,7 @@ L8LoopDz_TILE_4:
         bne L8LoopSz_TILE_4
 
     L8LoopSzEnd_TILE_4:
-    //add x12, x12, x15
-    add x12, x27, x15, LSL #1
+    add x12, x27, x15
     sub x14, x14, #2
 
     L8Tile4Quan:
@@ -879,9 +876,10 @@ L4LoopDz_TILE_4:
     SET_BIAS v8, v9, v10, v11
 
     L4LoopSz_TILE_4:
-        ld1 {v3.16b}, [x12], #16 // weight
+        ld1 {v3.16b}, [x12]      // weight
         ld1 {v0.16b}, [x11], x22 // src
         subs x13, x13, #1
+        add x12, x12, #32 // weight offset = lp*hp
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
         .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
@@ -974,17 +972,15 @@ L8LoopDz_TILE_1:
     movi v8.16b, #0
     movi v9.16b, #0
     L8LoopSz_TILE_1:
-        ld1 {v3.16b}, [x12], x15 // weight
+        ld1 {v3.16b, v4.16b}, [x12], #32 // weight
         ld1 {v0.s}[0], [x11], x22 // src
-        ld1 {v4.16b}, [x12], #16 // weight
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         subs x13, x13, #1
-        sub x12, x12, x15
         .inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]
         bne L8LoopSz_TILE_1
 
     L8LoopSzEnd_TILE_1:
-    add x12, x27, x15, LSL #1
+    add x12, x27, x15
     sub x14, x14, #2
 
     L8Tile1Quan:
@@ -1067,9 +1063,10 @@ L4LoopDz_TILE_1:
     mov x13, x3
     movi v8.16b, #0
     L4LoopSz_TILE_1:
-        ld1 {v3.16b}, [x12], #16 // weight
+        ld1 {v3.16b}, [x12] // weight
         ld1 {v0.s}[0], [x11], x22 // src
         subs x13, x13, #1
+        add x12, x12, #32 // weight offset = lp*hp
         .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
         bne L4LoopSz_TILE_1
 
@@ -1132,11 +1129,11 @@ cbz x24, Tile1_End_Offset
 add x24, x24, #4
 
 Tile1_End_Offset:
-    sub x7, x7, #1
+    subs x7, x7, #1
     add x0, x0, x21
     add x1, x1, #4
     add x25, x25, #4
-    b TILE_1
+    bne TILE_1
 
 End:
 ldp x23, x24, [sp, #(16 * 8)]

+ 107 - 0
source/backend/cpu/arm/arm64/MNNPackC2.S

@@ -0,0 +1,107 @@
+//
+//  MNNPackInt8C2.S
+//  MNN
+//
+//  Created by MNN on 2019/02/02.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __aarch64__
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNPackInt8C2
+//void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int32_t* areaOffset)
+//Auto load:
+//x0:dst, x1:src, x2:area, x3:depth, x4: areaOffset
+
+ldr w10, [x4, #4] // dstDepthOffset
+ldr w9, [x4, #0] // srcDepthOffset
+uxtw x10, w10
+uxtw x9, w9
+
+//x12: srcDepthOffset in bytes = srcArea * sizeof(float)
+mov x12, #4
+mul x12, x9, x12
+
+//x10 -> 2 * (dstArea - area) * sizeof(float)
+mov x5, #8
+sub x10, x10, x2
+mul x10, x5, x10
+
+//x9 -> (srcArea - area) * sizeof(float)
+mov x6, #4
+sub x9, x9, x2
+mul x9, x6, x9
+
+UpL2:
+cmp x3, #1
+ble UpL1
+
+UpL2Loop:
+add x5, x1, x12
+mov x8, x2
+cmp x8, #3
+ble UpL2AreaRemain
+UpL2AreaLoop:
+ld1 {v0.4s}, [x1], #16
+ld1 {v1.4s}, [x5], #16
+
+st2 {v0.4s, v1.4s}, [x0], #32
+sub x8, x8, #4
+cmp x8, #4
+bge UpL2AreaLoop
+
+cmp x8, #0
+beq UpL2AreaRemainEnd
+UpL2AreaRemain:
+ld1 {v0.s}[0], [x1], #4
+ld1 {v0.s}[1], [x5], #4
+
+st1 {v0.d}[0], [x0], #8
+
+subs x8, x8, #1
+bne UpL2AreaRemain
+
+UpL2AreaRemainEnd:
+sub x3, x3, #2
+add x1, x5, x9
+cmp x3, #2
+add x0, x10, x0
+bge UpL2Loop
+
+UpL1:
+cmp x3, #0
+beq UpEnd
+mov x8, x2
+cmp x8, #3
+ble UpL1AreaRemain
+UpL1AreaLoop:
+ld1 {v0.4s}, [x1], #16
+movi v1.4s, #0
+
+st2 {v0.4s, v1.4s}, [x0], #32
+sub x8, x8, #4
+cmp x8, #4
+bge UpL1AreaLoop
+
+cmp x8, #0
+beq UpL1AreaRemainEnd
+UpL1AreaRemain:
+movi v0.4s, #0
+ld1 {v0.s}[0], [x1], #4
+
+st1 {v0.d}[0], [x0], #8
+
+subs x8, x8, #1
+bne UpL1AreaRemain
+
+UpL1AreaRemainEnd:
+
+UpEnd:
+
+ret
+
+#endif
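A hedged C++ sketch of the packing pattern suggested by the loads and stores above: pairs of source planes (areaOffset[0] elements apart) are interleaved two-wide into the destination, an odd trailing plane is padded with zeros (the UpL1 path), and each element is 4 bytes wide as in the .4s lanes. The destination stride and the _ref name are assumptions, not part of the commit.

#include <cstddef>
#include <cstdint>

void MNNPackInt8C2_ref(float* dst, const float* src, size_t area, size_t depth,
                       const int32_t* areaOffset) {
    size_t srcArea = (size_t)areaOffset[0];
    size_t dstArea = (size_t)areaOffset[1];
    for (size_t z = 0; z < depth; z += 2) {
        const float* s0 = src + z * srcArea;
        const float* s1 = (z + 1 < depth) ? s0 + srcArea : nullptr;
        float* d = dst + z * dstArea;            // assumed output plane-pair stride
        for (size_t i = 0; i < area; ++i) {
            d[2 * i + 0] = s0[i];
            d[2 * i + 1] = s1 ? s1[i] : 0.0f;    // zero padding when depth is odd
        }
    }
}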

+ 147 - 0
source/backend/cpu/arm/arm64/MNNRGBAToBGRAFast.S

@@ -0,0 +1,147 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+// void MNNRGBAToBGRAFast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNRGBAToBGRAFast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+L10:
+cmp x2, #10
+blt L8
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64
+ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64
+ld4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64
+sub x2, x2, #10
+
+mov v16.16b, v2.16b
+mov v17.16b, v1.16b
+mov v18.16b, v0.16b
+mov v19.16b, v3.16b
+
+mov v20.16b, v6.16b
+mov v21.16b, v5.16b
+mov v22.16b, v4.16b
+mov v23.16b, v7.16b
+
+mov v24.16b, v10.16b
+mov v25.16b, v9.16b
+mov v26.16b, v8.16b
+mov v27.16b, v11.16b
+
+mov v0.16b, v14.16b
+mov v1.16b, v13.16b
+mov v2.16b, v12.16b
+mov v3.16b, v15.16b
+
+mov v4.16b, v30.16b
+mov v5.16b, v29.16b
+mov v6.16b, v28.16b
+mov v7.16b, v31.16b
+
+st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64
+st4 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #64
+st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
+st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
+
+b L10
+
+
+L8:
+cmp x2, #8
+blt L4
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64
+ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64
+sub x2, x2, #8
+
+mov v16.16b, v2.16b
+mov v17.16b, v1.16b
+mov v18.16b, v0.16b
+mov v19.16b, v3.16b
+
+mov v20.16b, v6.16b
+mov v21.16b, v5.16b
+mov v22.16b, v4.16b
+mov v23.16b, v7.16b
+
+mov v24.16b, v10.16b
+mov v25.16b, v9.16b
+mov v26.16b, v8.16b
+mov v27.16b, v11.16b
+
+mov v28.16b, v14.16b
+mov v29.16b, v13.16b
+mov v30.16b, v12.16b
+mov v31.16b, v15.16b
+
+st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64
+st4 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #64
+st4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x1], #64
+b L8
+
+L4:
+cmp x2, #4
+blt L2
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+sub x2, x2, #4
+
+mov v16.16b, v2.16b
+mov v17.16b, v1.16b
+mov v18.16b, v0.16b
+mov v19.16b, v3.16b
+
+mov v20.16b, v6.16b
+mov v21.16b, v5.16b
+mov v22.16b, v4.16b
+mov v23.16b, v7.16b
+
+st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+sub x2, x2, #2
+
+mov v16.16b, v2.16b
+mov v17.16b, v1.16b
+mov v18.16b, v0.16b
+mov v19.16b, v3.16b
+
+st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32
+
+mov v16.8b, v2.8b
+mov v17.8b, v1.8b
+mov v18.8b, v0.8b
+mov v19.8b, v3.8b
+
+st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif
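The routine above is a pure channel shuffle: RGBA in, BGRA out, alpha untouched. A scalar C++ sketch (assuming, as in the other conversion kernels, that one count unit covers 8 pixels):

#include <cstddef>

void MNNRGBAToBGRAFast_ref(const unsigned char* source, unsigned char* dest, size_t count) {
    size_t pixels = count * 8;
    for (size_t i = 0; i < pixels; ++i) {
        const unsigned char* p = source + 4 * i;
        unsigned char* q = dest + 4 * i;
        q[0] = p[2];   // B
        q[1] = p[1];   // G
        q[2] = p[0];   // R
        q[3] = p[3];   // A
    }
}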

+ 134 - 0
source/backend/cpu/arm/arm64/MNNRGBAToBGRFast.S

@@ -0,0 +1,134 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+// void MNNRGBAToBGRFast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNRGBAToBGRFast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+L10:
+cmp x2, #10
+blt L8
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64
+ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64
+ld4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64
+sub x2, x2, #10
+
+mov v16.16b, v2.16b
+mov v17.16b, v1.16b
+mov v18.16b, v0.16b
+
+mov v20.16b, v6.16b
+mov v21.16b, v5.16b
+mov v22.16b, v4.16b
+
+mov v24.16b, v10.16b
+mov v25.16b, v9.16b
+mov v26.16b, v8.16b
+
+mov v0.16b, v14.16b
+mov v1.16b, v13.16b
+mov v2.16b, v12.16b
+
+mov v4.16b, v30.16b
+mov v5.16b, v29.16b
+mov v6.16b, v28.16b
+
+st3 {v16.16b, v17.16b, v18.16b}, [x1], #48
+st3 {v20.16b, v21.16b, v22.16b}, [x1], #48
+st3 {v24.16b, v25.16b, v26.16b}, [x1], #48
+st3 {v0.16b, v1.16b, v2.16b}, [x1], #48
+st3 {v4.16b, v5.16b, v6.16b}, [x1], #48
+
+b L10
+
+
+L8:
+cmp x2, #8
+blt L4
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64
+ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64
+sub x2, x2, #8
+
+mov v16.16b, v2.16b
+mov v17.16b, v1.16b
+mov v18.16b, v0.16b
+
+mov v20.16b, v6.16b
+mov v21.16b, v5.16b
+mov v22.16b, v4.16b
+
+mov v24.16b, v10.16b
+mov v25.16b, v9.16b
+mov v26.16b, v8.16b
+
+mov v28.16b, v14.16b
+mov v29.16b, v13.16b
+mov v30.16b, v12.16b
+
+st3 {v16.16b, v17.16b, v18.16b}, [x1], #48
+st3 {v20.16b, v21.16b, v22.16b}, [x1], #48
+st3 {v24.16b, v25.16b, v26.16b}, [x1], #48
+st3 {v28.16b, v29.16b, v30.16b}, [x1], #48
+b L8
+
+L4:
+cmp x2, #4
+blt L2
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+sub x2, x2, #4
+
+mov v16.16b, v2.16b
+mov v17.16b, v1.16b
+mov v18.16b, v0.16b
+
+mov v20.16b, v6.16b
+mov v21.16b, v5.16b
+mov v22.16b, v4.16b
+
+st3 {v16.16b, v17.16b, v18.16b}, [x1], #48
+st3 {v20.16b, v21.16b, v22.16b}, [x1], #48
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+sub x2, x2, #2
+
+mov v16.16b, v2.16b
+mov v17.16b, v1.16b
+mov v18.16b, v0.16b
+
+st3 {v16.16b, v17.16b, v18.16b}, [x1], #48
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32
+
+mov v16.8b, v2.8b
+mov v17.8b, v1.8b
+mov v18.8b, v0.8b
+
+st3 {v16.8b, v17.8b, v18.8b}, [x1], #24
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif
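MNNRGBAToBGRFast is the same shuffle but drops the alpha channel (st3 instead of st4). A hedged scalar sketch under the same 8-pixels-per-count assumption:

#include <cstddef>

void MNNRGBAToBGRFast_ref(const unsigned char* source, unsigned char* dest, size_t count) {
    size_t pixels = count * 8;
    for (size_t i = 0; i < pixels; ++i) {
        const unsigned char* p = source + 4 * i;
        dest[3 * i + 0] = p[2];   // B
        dest[3 * i + 1] = p[1];   // G
        dest[3 * i + 2] = p[0];   // R
    }
}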

+ 96 - 0
source/backend/cpu/arm/arm64/MNNRGBAToGRAYFast.S

@@ -0,0 +1,96 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNRGBAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNRGBAToGRAYFast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+movi v29.16b, #19
+movi v30.16b, #38
+movi v31.16b, #7
+
+// gray = (r*19 + g*38 + b*7) >> 6
+
+L4:
+cmp x2, #4
+blt L2
+
+sub x2, x2, #4
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ld4 {v14.16b, v15.16b, v16.16b, v17.16b}, [x0], #64
+
+umull v4.8h, v0.8b, v29.8b
+umlal v4.8h, v1.8b, v30.8b
+umlal v4.8h, v2.8b, v31.8b
+
+umull2 v7.8h, v0.16b, v29.16b
+umlal2 v7.8h, v1.16b, v30.16b
+umlal2 v7.8h, v2.16b, v31.16b
+
+umull v18.8h, v14.8b, v29.8b
+umlal v18.8h, v15.8b, v30.8b
+umlal v18.8h, v16.8b, v31.8b
+
+umull2 v21.8h, v14.16b, v29.16b
+umlal2 v21.8h, v15.16b, v30.16b
+umlal2 v21.8h, v16.16b, v31.16b
+
+uqshrn v4.8b, v4.8h, #6
+uqshrn2 v4.16b, v7.8h, #6
+uqshrn v5.8b, v18.8h, #6
+uqshrn2 v5.16b, v21.8h, #6
+
+st1 {v4.16b, v5.16b}, [x1], #32
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+
+sub x2, x2, #2
+ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+
+umull v4.8h, v0.8b, v29.8b
+umlal v4.8h, v1.8b, v30.8b
+umlal v4.8h, v2.8b, v31.8b
+
+umull2 v7.8h, v0.16b, v29.16b
+umlal2 v7.8h, v1.16b, v30.16b
+umlal2 v7.8h, v2.16b, v31.16b
+
+uqshrn v4.8b, v4.8h, #6
+uqshrn2 v4.16b, v7.8h, #6
+
+st1 {v4.16b}, [x1], #16
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32
+
+umull v4.8h, v0.8b, v29.8b
+umlal v4.8h, v1.8b, v30.8b
+umlal v4.8h, v2.8b, v31.8b
+
+uqshrn v10.8b, v4.8h, #6
+
+st1 {v10.8b}, [x1], #8
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif
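The grayscale weights above are 19/38/7 applied to R/G/B with a final shift by 6, i.e. a BT.601 luma approximation in 6-bit fixed point. A scalar C++ sketch (the _ref name and the 8-pixels-per-count assumption are mine):

#include <cstddef>

void MNNRGBAToGRAYFast_ref(const unsigned char* source, unsigned char* dest, size_t count) {
    size_t pixels = count * 8;
    for (size_t i = 0; i < pixels; ++i) {
        const unsigned char* p = source + 4 * i;
        unsigned y = (19u * p[0] + 38u * p[1] + 7u * p[2]) >> 6;   // umull/umlal + uqshrn #6
        dest[i] = (unsigned char)(y > 255 ? 255 : y);              // uqshrn saturates
    }
}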

+ 126 - 0
source/backend/cpu/arm/arm64/MNNRGBToBGR.S

@@ -0,0 +1,126 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+// void MNNRGBToBGRC8(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNRGBToBGRC8
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+L12:
+cmp x2, #12
+blt L8
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v6.16b, v7.16b, v8.16b}, [x0], #48
+ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48
+ld3 {v15.16b, v16.16b, v17.16b}, [x0], #48
+ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48
+ld3 {v27.16b, v28.16b, v29.16b}, [x0], #48
+sub x2, x2, #12
+mov v3.16b, v2.16b
+mov v4.16b, v1.16b
+mov v5.16b, v0.16b
+mov v9.16b, v8.16b
+mov v10.16b, v7.16b
+mov v11.16b, v6.16b
+
+mov v18.16b, v14.16b
+mov v19.16b, v13.16b
+mov v20.16b, v12.16b
+mov v21.16b, v17.16b
+mov v22.16b, v16.16b
+mov v23.16b, v15.16b
+
+mov v0.16b, v26.16b
+mov v1.16b, v25.16b
+mov v2.16b, v24.16b
+mov v6.16b, v29.16b
+mov v7.16b, v28.16b
+mov v8.16b, v27.16b
+st3 {v3.16b, v4.16b, v5.16b}, [x1], #48
+st3 {v9.16b, v10.16b, v11.16b}, [x1], #48
+st3 {v18.16b, v19.16b, v20.16b}, [x1], #48
+st3 {v21.16b, v22.16b, v23.16b}, [x1], #48
+st3 {v0.16b, v1.16b, v2.16b}, [x1], #48
+st3 {v6.16b, v7.16b, v8.16b}, [x1], #48
+
+b L12
+
+
+L8:
+cmp x2, #8
+blt L4
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v6.16b, v7.16b, v8.16b}, [x0], #48
+ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48
+ld3 {v15.16b, v16.16b, v17.16b}, [x0], #48
+sub x2, x2, #8
+mov v3.16b, v2.16b
+mov v4.16b, v1.16b
+mov v5.16b, v0.16b
+mov v9.16b, v8.16b
+mov v10.16b, v7.16b
+mov v11.16b, v6.16b
+
+mov v18.16b, v14.16b
+mov v19.16b, v13.16b
+mov v20.16b, v12.16b
+mov v21.16b, v17.16b
+mov v22.16b, v16.16b
+mov v23.16b, v15.16b
+
+st3 {v3.16b, v4.16b, v5.16b}, [x1], #48
+st3 {v9.16b, v10.16b, v11.16b}, [x1], #48
+st3 {v18.16b, v19.16b, v20.16b}, [x1], #48
+st3 {v21.16b, v22.16b, v23.16b}, [x1], #48
+b L8
+
+L4:
+cmp x2, #4
+blt L2
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v6.16b, v7.16b, v8.16b}, [x0], #48
+sub x2, x2, #4
+mov v3.16b, v2.16b
+mov v4.16b, v1.16b
+mov v5.16b, v0.16b
+mov v9.16b, v8.16b
+mov v10.16b, v7.16b
+mov v11.16b, v6.16b
+
+st3 {v3.16b, v4.16b, v5.16b}, [x1], #48
+st3 {v9.16b, v10.16b, v11.16b}, [x1], #48
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+mov v3.16b, v2.16b
+mov v4.16b, v1.16b
+mov v5.16b, v0.16b
+sub x2, x2, #2
+st3 {v3.16b, v4.16b, v5.16b}, [x1], #48
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+mov v3.8b, v2.8b
+mov v4.8b, v1.8b
+mov v5.8b, v0.8b
+st3 {v3.8b, v4.8b, v5.8b}, [x1], #24
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif
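MNNRGBToBGRC8 swaps the first and third channel of packed 3-byte pixels; the C8 suffix matches the fact that each count unit covers 8 pixels. A hedged scalar sketch:

#include <cstddef>

void MNNRGBToBGRC8_ref(const unsigned char* source, unsigned char* dest, size_t count) {
    size_t pixels = count * 8;          // assumption: one count unit = 8 pixels
    for (size_t i = 0; i < pixels; ++i) {
        const unsigned char* p = source + 3 * i;
        dest[3 * i + 0] = p[2];
        dest[3 * i + 1] = p[1];
        dest[3 * i + 2] = p[0];
    }
}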

+ 169 - 0
source/backend/cpu/arm/arm64/MNNRGBToBGR555.S

@@ -0,0 +1,169 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNRGBToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNRGBToBGR555Fast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+movi v31.16b, #8
+neg v31.16b, v31.16b
+
+L6:
+cmp x2, #6
+blt L4
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48
+ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48
+and v0.16b, v0.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v31.16b // g & ~7
+ushr v2.16b, v2.16b, #3  // b >> 3
+and v11.16b, v11.16b, v31.16b // r & ~7
+and v12.16b, v12.16b, v31.16b // g & ~7
+ushr v13.16b, v13.16b, #3  // b >> 3
+and v24.16b, v24.16b, v31.16b // r & ~7
+and v25.16b, v25.16b, v31.16b // g & ~7
+ushr v26.16b, v26.16b, #3  // b >> 3
+sub x2, x2, #6
+
+ushll v3.8h, v0.8b, #7
+ushll v4.8h, v1.8b, #2
+uxtl v5.8h, v2.8b
+ushll2 v8.8h, v0.16b, #7
+ushll2 v9.8h, v1.16b, #2
+uxtl2 v10.8h, v2.16b
+
+ushll v14.8h, v11.8b, #7
+ushll v15.8h, v12.8b, #2
+uxtl v16.8h, v13.8b
+ushll2 v17.8h, v11.16b, #7
+ushll2 v18.8h, v12.16b, #2
+uxtl2 v19.8h, v13.16b
+
+ushll v6.8h, v24.8b, #7
+ushll v7.8h, v25.8b, #2
+uxtl v27.8h, v26.8b
+ushll2 v28.8h, v24.16b, #7
+ushll2 v29.8h, v25.16b, #2
+uxtl2 v30.8h, v26.16b
+
+orr v0.16b, v3.16b, v4.16b
+orr v0.16b, v0.16b, v5.16b
+orr v1.16b, v8.16b, v9.16b
+orr v1.16b, v1.16b, v10.16b
+
+orr v2.16b, v14.16b, v15.16b
+orr v2.16b, v2.16b, v16.16b
+orr v3.16b, v17.16b, v18.16b
+orr v3.16b, v3.16b, v19.16b
+
+orr v4.16b, v6.16b, v7.16b
+orr v4.16b, v4.16b, v27.16b
+orr v5.16b, v28.16b, v29.16b
+orr v5.16b, v5.16b, v30.16b
+
+st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+st1 {v4.8h, v5.8h}, [x1], #32
+
+b L6
+
+L4:
+cmp x2, #4
+blt L2
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48
+and v0.16b, v0.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v31.16b // g & ~7
+ushr v2.16b, v2.16b, #3  // b >> 3
+and v11.16b, v11.16b, v31.16b // r & ~7
+and v12.16b, v12.16b, v31.16b // g & ~7
+ushr v13.16b, v13.16b, #3  // b >> 3
+sub x2, x2, #4
+
+ushll v3.8h, v0.8b, #7
+ushll v4.8h, v1.8b, #2
+uxtl v5.8h, v2.8b
+ushll2 v8.8h, v0.16b, #7
+ushll2 v9.8h, v1.16b, #2
+uxtl2 v10.8h, v2.16b
+
+ushll v14.8h, v11.8b, #7
+ushll v15.8h, v12.8b, #2
+uxtl v16.8h, v13.8b
+ushll2 v17.8h, v11.16b, #7
+ushll2 v18.8h, v12.16b, #2
+uxtl2 v19.8h, v13.16b
+
+
+orr v20.16b, v3.16b, v4.16b
+orr v20.16b, v20.16b, v5.16b
+orr v21.16b, v8.16b, v9.16b
+orr v21.16b, v21.16b, v10.16b
+
+orr v22.16b, v14.16b, v15.16b
+orr v22.16b, v22.16b, v16.16b
+orr v23.16b, v17.16b, v18.16b
+orr v23.16b, v23.16b, v19.16b
+
+st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x1], #64
+
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+and v0.16b, v0.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v31.16b // g & ~7
+sub x2, x2, #2
+ushr v2.16b, v2.16b, #3  // b >> 3
+
+ushll v3.8h, v0.8b, #7
+ushll v4.8h, v1.8b, #2
+uxtl v5.8h, v2.8b
+ushll2 v8.8h, v0.16b, #7
+ushll2 v9.8h, v1.16b, #2
+uxtl2 v10.8h, v2.16b
+
+orr v6.16b, v3.16b, v4.16b
+orr v6.16b, v6.16b, v5.16b
+orr v7.16b, v8.16b, v9.16b
+orr v7.16b, v7.16b, v10.16b
+
+st1 {v6.8h, v7.8h}, [x1], #32
+
+b L2
+
+L1:
+cmp x2, #1
+blt End
+
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+and v0.8b, v0.8b, v31.8b // r & ~7
+and v1.8b, v1.8b, v31.8b // g & ~7
+ushr v2.8b, v2.8b, #3  // b >> 3
+ushll v0.8h, v0.8b, #7
+ushll v1.8h, v1.8b, #2
+uxtl v2.8h, v2.8b
+orr v0.16b, v0.16b, v1.16b
+orr v0.16b, v0.16b, v2.16b
+
+st1 {v0.8h}, [x1], #16
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif
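Bit layout implied by the shifts above: red keeps its top 5 bits at position 10, green its top 5 bits at position 5, blue its top 5 bits at position 0, giving one 16-bit value per pixel. A scalar sketch of that packing (function name and count unit are assumptions):

#include <cstddef>
#include <cstdint>

void MNNRGBToBGR555Fast_ref(const unsigned char* source, unsigned char* dest, size_t count) {
    uint16_t* out = (uint16_t*)dest;
    size_t pixels = count * 8;          // assumption: one count unit = 8 pixels
    for (size_t i = 0; i < pixels; ++i) {
        unsigned r = source[3 * i + 0], g = source[3 * i + 1], b = source[3 * i + 2];
        out[i] = (uint16_t)(((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3));
    }
}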

+ 187 - 0
source/backend/cpu/arm/arm64/MNNRGBToBGR565.S

@@ -0,0 +1,187 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNRGBToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNRGBToBGR565Fast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+movi v31.16b, #8
+neg v31.16b, v31.16b
+
+L6:
+cmp x2, #6
+blt L4
+
+movi v30.16b, #4
+neg v30.16b, v30.16b
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48
+ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48
+and v0.16b, v0.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v30.16b // g & ~3
+ushr v2.16b, v2.16b, #3  // b >> 3
+and v11.16b, v11.16b, v31.16b // r & ~7
+and v12.16b, v12.16b, v30.16b // g & ~3
+ushr v13.16b, v13.16b, #3  // b >> 3
+and v24.16b, v24.16b, v31.16b // r & ~7
+and v25.16b, v25.16b, v30.16b // g & ~3
+ushr v26.16b, v26.16b, #3  // b >> 3
+sub x2, x2, #6
+
+ushll v3.8h, v0.8b, #7
+shl v3.8h, v3.8h, #1
+ushll v4.8h, v1.8b, #3
+uxtl v5.8h, v2.8b
+ushll2 v8.8h, v0.16b, #7
+shl v8.8h, v8.8h, #1
+ushll2 v9.8h, v1.16b, #3
+uxtl2 v10.8h, v2.16b
+
+ushll v14.8h, v11.8b, #7
+shl v14.8h, v14.8h, #1
+ushll v15.8h, v12.8b, #3
+uxtl v16.8h, v13.8b
+ushll2 v17.8h, v11.16b, #7
+shl v17.8h, v17.8h, #1
+ushll2 v18.8h, v12.16b, #3
+uxtl2 v19.8h, v13.16b
+
+ushll v6.8h, v24.8b, #7
+shl v6.8h, v6.8h, #1
+ushll v7.8h, v25.8b, #3
+uxtl v27.8h, v26.8b
+ushll2 v28.8h, v24.16b, #7
+shl v28.8h, v28.8h, #1
+ushll2 v29.8h, v25.16b, #3
+uxtl2 v30.8h, v26.16b
+
+orr v0.16b, v3.16b, v4.16b
+orr v0.16b, v0.16b, v5.16b
+orr v1.16b, v8.16b, v9.16b
+orr v1.16b, v1.16b, v10.16b
+
+orr v2.16b, v14.16b, v15.16b
+orr v2.16b, v2.16b, v16.16b
+orr v3.16b, v17.16b, v18.16b
+orr v3.16b, v3.16b, v19.16b
+
+orr v4.16b, v6.16b, v7.16b
+orr v4.16b, v4.16b, v27.16b
+orr v5.16b, v28.16b, v29.16b
+orr v5.16b, v5.16b, v30.16b
+
+st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+st1 {v4.8h, v5.8h}, [x1], #32
+
+b L6
+
+L4:
+movi v30.16b, #4
+neg v30.16b, v30.16b
+cmp x2, #4
+blt L2
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48
+and v0.16b, v0.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v30.16b // g & ~3
+ushr v2.16b, v2.16b, #3  // b >> 3
+and v11.16b, v11.16b, v31.16b // r & ~7
+and v12.16b, v12.16b, v30.16b // g & ~3
+ushr v13.16b, v13.16b, #3  // b >> 3
+sub x2, x2, #4
+
+ushll v3.8h, v0.8b, #7
+shl v3.8h, v3.8h, #1
+ushll v4.8h, v1.8b, #3
+uxtl v5.8h, v2.8b
+ushll2 v8.8h, v0.16b, #7
+shl v8.8h, v8.8h, #1
+ushll2 v9.8h, v1.16b, #3
+uxtl2 v10.8h, v2.16b
+
+ushll v14.8h, v11.8b, #7
+shl v14.8h, v14.8h, #1
+ushll v15.8h, v12.8b, #3
+uxtl v16.8h, v13.8b
+ushll2 v17.8h, v11.16b, #7
+shl v17.8h, v17.8h, #1
+ushll2 v18.8h, v12.16b, #3
+uxtl2 v19.8h, v13.16b
+
+
+orr v20.16b, v3.16b, v4.16b
+orr v20.16b, v20.16b, v5.16b
+orr v21.16b, v8.16b, v9.16b
+orr v21.16b, v21.16b, v10.16b
+
+orr v22.16b, v14.16b, v15.16b
+orr v22.16b, v22.16b, v16.16b
+orr v23.16b, v17.16b, v18.16b
+orr v23.16b, v23.16b, v19.16b
+
+st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x1], #64
+
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+and v0.16b, v0.16b, v31.16b // r & ~7
+and v1.16b, v1.16b, v30.16b // g & ~3
+sub x2, x2, #2
+ushr v2.16b, v2.16b, #3  // b >> 3
+
+ushll v3.8h, v0.8b, #7
+shl v3.8h, v3.8h, #1
+ushll v4.8h, v1.8b, #3
+uxtl v5.8h, v2.8b
+ushll2 v8.8h, v0.16b, #7
+shl v8.8h, v8.8h, #1
+ushll2 v9.8h, v1.16b, #3
+uxtl2 v10.8h, v2.16b
+
+orr v6.16b, v3.16b, v4.16b
+orr v6.16b, v6.16b, v5.16b
+orr v7.16b, v8.16b, v9.16b
+orr v7.16b, v7.16b, v10.16b
+
+st1 {v6.8h, v7.8h}, [x1], #32
+
+b L2
+
+L1:
+cmp x2, #1
+blt End
+
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+and v0.8b, v0.8b, v31.8b // r & ~7
+and v1.8b, v1.8b, v30.8b // g & ~3
+ushr v2.8b, v2.8b, #3  // b >> 3
+ushll v0.8h, v0.8b, #7
+shl v0.8h, v0.8h, #1
+ushll v1.8h, v1.8b, #3
+uxtl v2.8h, v2.8b
+orr v0.16b, v0.16b, v1.16b
+orr v0.16b, v0.16b, v2.16b
+
+st1 {v0.8h}, [x1], #16
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif
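The 565 variant widens green to 6 bits: red's top 5 bits land at position 11, green's top 6 at position 5, blue's top 5 at position 0. A scalar sketch under the same assumptions as the 555 version:

#include <cstddef>
#include <cstdint>

void MNNRGBToBGR565Fast_ref(const unsigned char* source, unsigned char* dest, size_t count) {
    uint16_t* out = (uint16_t*)dest;
    size_t pixels = count * 8;
    for (size_t i = 0; i < pixels; ++i) {
        unsigned r = source[3 * i + 0], g = source[3 * i + 1], b = source[3 * i + 2];
        out[i] = (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
    }
}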

+ 92 - 0
source/backend/cpu/arm/arm64/MNNRGBToGRAYFast.S

@@ -0,0 +1,92 @@
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+// void MNNRGBToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count);
+asm_function MNNRGBToGRAYFast
+// x0: source, x1: dest, x2: count
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8,  d9,  [sp, #(16 * 3)]
+
+movi v29.16b, #19
+movi v30.16b, #38
+movi v31.16b, #7
+
+L4:
+cmp x2, #4
+blt L2
+
+sub x2, x2, #4
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ld3 {v14.16b, v15.16b, v16.16b}, [x0], #48
+
+umull v4.8h, v0.8b, v29.8b // r*19
+umlal v4.8h, v1.8b, v30.8b // g*38
+umlal v4.8h, v2.8b, v31.8b // b*7
+
+umull2 v7.8h, v0.16b, v29.16b // r*19
+umlal2 v7.8h, v1.16b, v30.16b // g*38
+umlal2 v7.8h, v2.16b, v31.16b // b*7
+
+umull v18.8h, v14.8b, v29.8b // r*19
+umlal v18.8h, v15.8b, v30.8b // g*38
+umlal v18.8h, v16.8b, v31.8b // b*7
+
+umull2 v21.8h, v14.16b, v29.16b // r*19
+umlal2 v21.8h, v15.16b, v30.16b // g*38
+umlal2 v21.8h, v16.16b, v31.16b // b*7
+
+uqshrn v4.8b, v4.8h, #6
+uqshrn2 v4.16b, v7.8h, #6
+uqshrn v5.8b, v18.8h, #6
+uqshrn2 v5.16b, v21.8h, #6
+
+st1 {v4.16b, v5.16b}, [x1], #32
+b L4
+
+L2:
+cmp x2, #2
+blt L1
+
+sub x2, x2, #2
+ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
+
+umull v4.8h, v0.8b, v29.8b // r*19
+umlal v4.8h, v1.8b, v30.8b // g*38
+umlal v4.8h, v2.8b, v31.8b // b*7
+
+umull2 v7.8h, v0.16b, v29.16b // r*19
+umlal2 v7.8h, v1.16b, v30.16b // g*38
+umlal2 v7.8h, v2.16b, v31.16b // b*7
+
+uqshrn v4.8b, v4.8h, #6
+uqshrn2 v4.16b, v7.8h, #6
+
+st1 {v4.16b}, [x1], #16
+b L2
+
+L1:
+cmp x2, #1
+blt End
+ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
+
+umull v4.8h, v0.8b, v29.8b // r*19
+umlal v4.8h, v1.8b, v30.8b // g*38
+umlal v4.8h, v2.8b, v31.8b // b*7
+
+uqshrn v10.8b, v4.8h, #6
+
+st1 {v10.8b}, [x1], #8
+
+End:
+ldp d8,  d9,  [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 4)
+ret
+#endif
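MNNRGBToGRAYFast uses the same 19/38/7 fixed-point luma weights as the RGBA variant above, just with 3-byte input pixels. A minimal scalar sketch:

#include <cstddef>

void MNNRGBToGRAYFast_ref(const unsigned char* source, unsigned char* dest, size_t count) {
    size_t pixels = count * 8;          // assumption: one count unit = 8 pixels
    for (size_t i = 0; i < pixels; ++i) {
        const unsigned char* p = source + 3 * i;
        dest[i] = (unsigned char)((19u * p[0] + 38u * p[1] + 7u * p[2]) >> 6);
    }
}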

+ 0 - 0
source/backend/cpu/arm/arm64/MNNSamplerC3BilinearOpt.S


Some files were not shown because too many files changed in this diff