// Source file: MNNC3ToC4Fast.S
  1. #ifdef __aarch64__
  2. #include "MNNAsmGlobal.h"
  3. .text
  4. .align 5
  5. // void MNNC3ToC4Fast(const unsigned char* source, unsigned char* dest, size_t count);
  6. asm_function MNNC3ToC4Fast
  7. // x0: source, x1: dest, x2: count
  8. stp d14, d15, [sp, #(-16 * 4)]!
  9. stp d12, d13, [sp, #(16 * 1)]
  10. stp d10, d11, [sp, #(16 * 2)]
  11. stp d8, d9, [sp, #(16 * 3)]
  12. movi v3.16b, #255
  13. movi v7.16b, #255
  14. movi v11.16b, #255
  15. movi v15.16b, #255
  16. movi v19.16b, #255
  17. movi v23.16b, #255
  18. movi v27.16b, #255
  19. movi v31.16b, #255
  20. L16:
  21. cmp x2, #16
  22. blt L12
  23. ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
  24. ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48
  25. ld3 {v8.16b, v9.16b, v10.16b}, [x0], #48
  26. ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48
  27. ld3 {v16.16b, v17.16b, v18.16b}, [x0], #48
  28. ld3 {v20.16b, v21.16b, v22.16b}, [x0], #48
  29. ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48
  30. ld3 {v28.16b, v29.16b, v30.16b}, [x0], #48
  31. sub x2, x2, #16
  32. st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
  33. st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
  34. st4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64
  35. st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x1], #64
  36. st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
  37. st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64
  38. st4 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #64
  39. st4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x1], #64
  40. b L16
  41. L12:
  42. cmp x2, #12
  43. blt L8
  44. ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
  45. ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48
  46. ld3 {v8.16b, v9.16b, v10.16b}, [x0], #48
  47. ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48
  48. ld3 {v16.16b, v17.16b, v18.16b}, [x0], #48
  49. ld3 {v20.16b, v21.16b, v22.16b}, [x0], #48
  50. sub x2, x2, #12
  51. st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
  52. st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
  53. st4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64
  54. st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x1], #64
  55. st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
  56. st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64
  57. b L12
  58. L8:
  59. cmp x2, #8
  60. blt L4
  61. ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
  62. ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48
  63. ld3 {v8.16b, v9.16b, v10.16b}, [x0], #48
  64. ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48
  65. sub x2, x2, #8
  66. st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
  67. st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
  68. st4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64
  69. st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x1], #64
  70. b L8
  71. L4:
  72. cmp x2, #4
  73. blt L2
  74. ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
  75. ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48
  76. sub x2, x2, #4
  77. st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
  78. st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
  79. b L4
  80. L2:
  81. cmp x2, #2
  82. blt L1
  83. ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48
  84. sub x2, x2, #2
  85. st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
  86. b L2
  87. L1:
  88. cmp x2, #1
  89. blt End
  90. ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24
  91. st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], #32
  92. End:
  93. ldp d8, d9, [sp, #(16 * 3)]
  94. ldp d10, d11, [sp, #(16 * 2)]
  95. ldp d12, d13, [sp, #(16 * 1)]
  96. ldp d14, d15, [sp], #(16 * 4)
  97. ret
  98. #endif