//
//  MNNSamplerC3BilinearOpt.S
//  MNN
//
//  Created by MNN on 2018/11/20.
//  Copyright © 2018, Alibaba Group Holding Limited
//
  #ifdef __aarch64__
  #include "MNNAsmGlobal.h"

  .text
  .align 5

  //-----------------------------------------------------------------------
  // void MNNSamplerC3BilinearOpt(const unsigned char* source, unsigned char* dest,
  //                              float* points, size_t count,
  //                              size_t iw, size_t ih, size_t yStride);
  //
  // Bilinear sampling of 3-byte pixels. For each of `count` samples the four
  // neighbouring source pixels are blended per channel and 3 bytes are
  // written to dest; the sample position then advances by a fixed step.
  //
  // In:    x0 = source
  //        x1 = dest
  //        x2 = points, four floats: [x, y] start position, then [dx, dy] step
  //        x3 = count
  //        x4 = xMax, x5 = yMax: inclusive upper clamp for the pixel indices
  //             NOTE(review): the prototype names these iw/ih, yet indices
  //             equal to x4/x5 are dereferenced -- presumably callers pass
  //             iw - 1 / ih - 1; confirm against the call site.
  //        x6 = yStride, bytes between consecutive rows
  // Out:   3 * count bytes stored at dest
  // Clobb: x1, x3, x4, x5, x12, v0-v7, v16, v19, v20, flags
  // Notes: - Only w4, w5 and w6 are consumed, and 3 * x and y * yStride are
  //          formed in 32-bit lanes, so each product has to fit in 32 bits.
  //        - Pixels are handled as 4-lane vectors of which lanes 0..2 are the
  //          channels; lane 3 carries stale data that is computed on but
  //          never stored.
  //-----------------------------------------------------------------------
  asm_function MNNSamplerC3BilinearOpt
      cbz     x3, End                     // count == 0: touch nothing

      movi    v19.4s, #0                  // v19 = 0, lower clamp bound
      ld1     {v0.2s, v1.2s}, [x2]        // v0 = [x, y], v1 = [dx, dy]
      mov     v16.s[0], w4
      mov     v16.s[1], w5                // v16 = [xMax, yMax]
      mov     w12, #3
      mov     v7.s[0], w12                // 3 bytes per pixel
      mov     v7.s[1], w6                 // v7 = [bpp, yStride]
      dup     v20.2d, x0                  // v20 = [source, source]

  L1Loop:
      // Integer neighbours (clamped to [0, max]) and fractional weights.
      fcvtzs  v2.2s, v0.2s                // [x0, y0], rounded toward zero
      frintm  v4.2s, v0.2s                // floor([x, y])
      smax    v2.2s, v2.2s, v19.2s        // [x0, y0] >= 0
      fcvtps  v3.2s, v0.2s                // [x1, y1], rounded up
      fabd    v4.2s, v0.2s, v4.2s         // v4 = [xF, yF] = |p - floor(p)|
      smax    v3.2s, v3.2s, v19.2s        // [x1, y1] >= 0
      smin    v2.2s, v2.2s, v16.2s        // [x0, y0] <= [xMax, yMax]
      smin    v3.2s, v3.2s, v16.2s        // [x1, y1] <= [xMax, yMax]

      // Byte offsets of the four neighbours, then absolute addresses.
      mul     v2.2s, v2.2s, v7.2s         // [bpp*x0, y0*yStride]
      mul     v3.2s, v3.2s, v7.2s         // [bpp*x1, y1*yStride]
      mov     v2.s[2], v3.s[0]
      mov     v3.s[2], v2.s[0]
      mov     v2.s[3], v2.s[1]            // v2 = [bpp*x0, y0*yStride, bpp*x1, y0*yStride]
      mov     v3.s[3], v3.s[1]            // v3 = [bpp*x1, y1*yStride, bpp*x0, y1*yStride]
      uaddlp  v2.2d, v2.4s                // offsets of [c00, c01] (row y0: x0, x1)
      uaddlp  v3.2d, v3.4s                // offsets of [c11, c10] (row y1: x1, x0)
      add     v2.2d, v20.2d, v2.2d        // addresses of [c00, c01]
      add     v3.2d, v20.2d, v3.2d        // addresses of [c11, c10]

      // Gather 3 bytes per neighbour (halfword + byte) and widen to 16 bit.
      mov     x4, v2.d[0]
      mov     x5, v2.d[1]
      ld1     {v5.h}[0], [x4], #2
      ld1     {v5.b}[2], [x4]             // v5 bytes 0..2 = c00
      ld1     {v5.h}[2], [x5], #2
      ld1     {v5.b}[6], [x5]             // v5 bytes 4..6 = c01
      mov     x4, v3.d[0]
      uxtl    v5.8h, v5.8b                // v5 = [c00 | c01] as u16
      mov     x5, v3.d[1]
      ld1     {v6.h}[0], [x4], #2
      ld1     {v6.b}[2], [x4]             // v6 bytes 0..2 = c11
      ld1     {v6.h}[2], [x5], #2
      ld1     {v6.b}[6], [x5]             // v6 bytes 4..6 = c10
      uxtl    v6.8h, v6.8b                // v6 = [c11 | c10] as u16

      // The address registers v2/v3 are free again from here on.
      // Widen to 32 bit and convert: v2 = c00, v3 = c01, v5 = c11, v6 = c10.
      uxtl    v2.4s, v5.4h                // c00
      uxtl2   v3.4s, v5.8h                // c01
      ucvtf   v2.4s, v2.4s
      uxtl    v5.4s, v6.4h                // c11
      ucvtf   v3.4s, v3.4s
      uxtl2   v6.4s, v6.8h                // c10
      ucvtf   v5.4s, v5.4s
      ucvtf   v6.4s, v6.4s

      // Interpolate along x on both rows, then along y between the rows.
      fsub    v3.4s, v3.4s, v2.4s         // c01 - c00
      fsub    v5.4s, v5.4s, v6.4s         // c11 - c10
      fmla    v2.4s, v3.4s, v4.s[0]       // r0 = c00 + (c01 - c00) * xF
      fmla    v6.4s, v5.4s, v4.s[0]       // r1 = c10 + (c11 - c10) * xF
      fsub    v6.4s, v6.4s, v2.4s         // r1 - r0
      fmla    v2.4s, v6.4s, v4.s[1]       // r0 + (r1 - r0) * yF

      // Back to bytes: truncate, then narrow 32 -> 16 -> 8 with saturation.
      fcvtzs  v2.4s, v2.4s
      uqxtn   v2.4h, v2.4s
      uqxtn   v2.8b, v2.8h                // byte lanes 0..2 = channels 0..2

      fadd    v0.2s, v0.2s, v1.2s         // next sample position
      subs    x3, x3, #1                  // flags are consumed by the bne below
      st1     {v2.h}[0], [x1], #2         // channels 0 and 1
      st1     {v2.b}[2], [x1], #1         // channel 2 lives in byte lane 2
      bne     L1Loop

  End:
      ret

  #endif