/* MNNBGRToBGR565.S */
  #ifdef __aarch64__
  #include "MNNAsmGlobal.h"

  .text
  .align 5

  //-----------------------------------------------------------------------
  // void MNNBGRToBGR565Fast(const unsigned char* source,
  //                         unsigned char* dest, size_t count);
  //
  // Packs interleaved 3-byte pixels into 16-bit 5-6-5 values. Following
  // the byte order implied by the function name, the three bytes of each
  // source pixel are treated as B, G, R, and every output halfword is
  //
  //     ((R & 0xF8) << 8) | ((G & 0xFC) << 3) | (B >> 3)
  //
  // In:    x0 = source  (advanced by 24 bytes per count unit)
  //        x1 = dest    (advanced by 16 bytes per count unit)
  //        x2 = count, in units of 8 pixels (unsigned)
  // Out:   none
  // Clobb: x0-x2, v0-v7, v16-v31, flags; v8-v15 are used but their low
  //        64 bits are saved and restored as AAPCS64 requires.
  // Stack: 64 bytes, sp stays 16-byte aligned.
  //
  // The work is done in chunks of 6, 4, 2 and finally 1 count unit.
  //-----------------------------------------------------------------------
  asm_function MNNBGRToBGR565Fast
      // Save the callee-saved low halves of v8-v15.
      stp     d14, d15, [sp, #-64]!
      stp     d12, d13, [sp, #16]
      stp     d10, d11, [sp, #32]
      stp     d8,  d9,  [sp, #48]

      // Channel masks, built once and never overwritten below.
      movi    v31.16b, #0xf8              // R mask: keep the top 5 bits
      movi    v30.16b, #0xfc              // G mask: keep the top 6 bits

  // ---- 6 count units (48 pixels) per iteration -------------------------
  LBgr565Loop6:
      cmp     x2, #6
      b.lo    LBgr565Loop4                // unsigned: count is a size_t

      ld3     {v0.16b,  v1.16b,  v2.16b},  [x0], #48   // v0=B  v1=G  v2=R
      ld3     {v11.16b, v12.16b, v13.16b}, [x0], #48   // v11=B v12=G v13=R
      ld3     {v24.16b, v25.16b, v26.16b}, [x0], #48   // v24=B v25=G v26=R
      sub     x2, x2, #6

      and     v2.16b,  v2.16b,  v31.16b   // R & 0xF8
      and     v1.16b,  v1.16b,  v30.16b   // G & 0xFC
      ushr    v0.16b,  v0.16b,  #3        // B >> 3
      and     v13.16b, v13.16b, v31.16b
      and     v12.16b, v12.16b, v30.16b
      ushr    v11.16b, v11.16b, #3
      and     v26.16b, v26.16b, v31.16b
      and     v25.16b, v25.16b, v30.16b
      ushr    v24.16b, v24.16b, #3

      // Widen every channel to 16 bits at its final bit position.
      shll    v3.8h,   v2.8b,   #8        // R << 8, pixels 0-7
      ushll   v4.8h,   v1.8b,   #3        // G << 3
      uxtl    v5.8h,   v0.8b              // B
      shll2   v8.8h,   v2.16b,  #8        // pixels 8-15
      ushll2  v9.8h,   v1.16b,  #3
      uxtl2   v10.8h,  v0.16b

      shll    v14.8h,  v13.8b,  #8        // pixels 16-23
      ushll   v15.8h,  v12.8b,  #3
      uxtl    v16.8h,  v11.8b
      shll2   v17.8h,  v13.16b, #8        // pixels 24-31
      ushll2  v18.8h,  v12.16b, #3
      uxtl2   v19.8h,  v11.16b

      shll    v6.8h,   v26.8b,  #8        // pixels 32-39
      ushll   v7.8h,   v25.8b,  #3
      uxtl    v27.8h,  v24.8b
      shll2   v28.8h,  v26.16b, #8        // pixels 40-47
      ushll2  v29.8h,  v25.16b, #3
      uxtl2   v20.8h,  v24.16b            // v20 is a scratch; v30 keeps the G mask

      // Merge the three channels into v0-v5 (consecutive for the stores).
      orr     v0.16b, v3.16b,  v4.16b
      orr     v0.16b, v0.16b,  v5.16b
      orr     v1.16b, v8.16b,  v9.16b
      orr     v1.16b, v1.16b,  v10.16b
      orr     v2.16b, v14.16b, v15.16b
      orr     v2.16b, v2.16b,  v16.16b
      orr     v3.16b, v17.16b, v18.16b
      orr     v3.16b, v3.16b,  v19.16b
      orr     v4.16b, v6.16b,  v7.16b
      orr     v4.16b, v4.16b,  v27.16b
      orr     v5.16b, v28.16b, v29.16b
      orr     v5.16b, v5.16b,  v20.16b

      st1     {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
      st1     {v4.8h, v5.8h}, [x1], #32
      b       LBgr565Loop6

  // ---- 4 count units (32 pixels) per iteration -------------------------
  LBgr565Loop4:
      cmp     x2, #4
      b.lo    LBgr565Loop2

      ld3     {v0.16b,  v1.16b,  v2.16b},  [x0], #48
      ld3     {v11.16b, v12.16b, v13.16b}, [x0], #48
      sub     x2, x2, #4

      and     v2.16b,  v2.16b,  v31.16b   // R & 0xF8
      and     v1.16b,  v1.16b,  v30.16b   // G & 0xFC
      ushr    v0.16b,  v0.16b,  #3        // B >> 3
      and     v13.16b, v13.16b, v31.16b
      and     v12.16b, v12.16b, v30.16b
      ushr    v11.16b, v11.16b, #3

      shll    v3.8h,   v2.8b,   #8
      ushll   v4.8h,   v1.8b,   #3
      uxtl    v5.8h,   v0.8b
      shll2   v8.8h,   v2.16b,  #8
      ushll2  v9.8h,   v1.16b,  #3
      uxtl2   v10.8h,  v0.16b

      shll    v14.8h,  v13.8b,  #8
      ushll   v15.8h,  v12.8b,  #3
      uxtl    v16.8h,  v11.8b
      shll2   v17.8h,  v13.16b, #8
      ushll2  v18.8h,  v12.16b, #3
      uxtl2   v19.8h,  v11.16b

      orr     v20.16b, v3.16b,  v4.16b
      orr     v20.16b, v20.16b, v5.16b
      orr     v21.16b, v8.16b,  v9.16b
      orr     v21.16b, v21.16b, v10.16b
      orr     v22.16b, v14.16b, v15.16b
      orr     v22.16b, v22.16b, v16.16b
      orr     v23.16b, v17.16b, v18.16b
      orr     v23.16b, v23.16b, v19.16b

      st1     {v20.8h, v21.8h, v22.8h, v23.8h}, [x1], #64
      b       LBgr565Loop4

  // ---- 2 count units (16 pixels) per iteration -------------------------
  LBgr565Loop2:
      cmp     x2, #2
      b.lo    LBgr565Tail1

      ld3     {v0.16b, v1.16b, v2.16b}, [x0], #48
      sub     x2, x2, #2

      and     v2.16b, v2.16b, v31.16b     // R & 0xF8
      and     v1.16b, v1.16b, v30.16b     // G & 0xFC
      ushr    v0.16b, v0.16b, #3          // B >> 3

      shll    v3.8h,  v2.8b,  #8
      ushll   v4.8h,  v1.8b,  #3
      uxtl    v5.8h,  v0.8b
      shll2   v8.8h,  v2.16b, #8
      ushll2  v9.8h,  v1.16b, #3
      uxtl2   v10.8h, v0.16b

      orr     v6.16b, v3.16b, v4.16b
      orr     v6.16b, v6.16b, v5.16b
      orr     v7.16b, v8.16b, v9.16b
      orr     v7.16b, v7.16b, v10.16b

      st1     {v6.8h, v7.8h}, [x1], #32
      b       LBgr565Loop2

  // ---- final single count unit (8 pixels) ------------------------------
  LBgr565Tail1:
      cbz     x2, LBgr565End              // x2 is 0 or 1 here

      ld3     {v0.8b, v1.8b, v2.8b}, [x0], #24

      and     v2.8b, v2.8b, v31.8b        // R & 0xF8
      and     v1.8b, v1.8b, v30.8b        // G & 0xFC
      ushr    v0.8b, v0.8b, #3            // B >> 3

      shll    v2.8h, v2.8b, #8            // R << 8
      ushll   v1.8h, v1.8b, #3            // G << 3
      uxtl    v0.8h, v0.8b                // B

      orr     v3.16b, v0.16b, v1.16b
      orr     v3.16b, v3.16b, v2.16b
      st1     {v3.8h}, [x1], #16

  LBgr565End:
      // Restore v8-v15 low halves and release the frame.
      ldp     d8,  d9,  [sp, #48]
      ldp     d10, d11, [sp, #32]
      ldp     d12, d13, [sp, #16]
      ldp     d14, d15, [sp], #64
      ret

  #endif