//
// MNNSamplerC3BilinearOpt.S
// MNN
//
// Created by MNN on 2018/11/20.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

//-----------------------------------------------------------------------------
// void MNNSamplerC3BilinearOpt(const unsigned char* source, unsigned char* dest,
//                              float* points, size_t count,
//                              size_t iw, size_t ih, size_t yStride);
//
// Bilinear sampler for 3-byte pixels. For each of `count` output pixels the
// current coordinate (x, y) is split into its four surrounding integer
// corners, each corner is clamped to [0, xMax] x [0, yMax], the four 3-byte
// pixels are fetched from `source` and blended with the fractional parts of
// (x, y). Three bytes are written to `dest` per pixel, then (x, y) is advanced
// by the step (dx, dy).
//
// In:
//   x0 = source   base address of the source image
//   x1 = dest     output, 3 bytes written per pixel
//   x2 = points   four floats: points[0..1] = start (x, y),
//                              points[2..3] = per-pixel step (dx, dy)
//   x3 = count    number of pixels to produce
//   x4 = xMax     upper clamp for the x index (named `iw` in the C prototype)
//   x5 = yMax     upper clamp for the y index (named `ih` in the C prototype)
//   x6 = yStride  byte distance between two source rows
//   Only the low 32 bits of x4, x5 and x6 are used, and the per-corner offsets
//   3 * x and y * yStride are computed in 32-bit lanes.
//
// Clobbers: x1 (advanced), x3, x12, x13, v0-v7, v16, v19, v20, flags.
//
// Pixels are processed one at a time; there is no multi-pixel path.
//-----------------------------------------------------------------------------
asm_function MNNSamplerC3BilinearOpt
    movi    v19.4s, #0                      // v19 = 0, lower clamp bound
    ld1     {v0.2s, v1.2s}, [x2]            // v0 = (x, y), v1 = (dx, dy)

    cbz     x3, End                         // count == 0: nothing to write

    mov     v16.s[0], w4
    mov     v16.s[1], w5                    // v16 = [xMax, yMax]
    mov     w12, #3
    mov     v7.s[0], w12                    // 3 bytes per pixel
    mov     v7.s[1], w6                     // v7  = [3, yStride]
    dup     v20.2d, x0                      // v20 = [source, source]

L1Loop:
    // ---- integer corners and fractional weights -------------------------
    fcvtzs  v2.2s, v0.2s                    // [x0, y0] = (x, y) rounded toward zero
    frintm  v4.2s, v0.2s                    // floor(x, y)
    smax    v2.2s, v2.2s, v19.2s            // clamp below at 0
    fcvtps  v3.2s, v0.2s                    // [x1, y1] = ceil(x, y)
    fabd    v4.2s, v0.2s, v4.2s             // [xF, yF] = |(x, y) - floor(x, y)|
    smax    v3.2s, v3.2s, v19.2s
    smin    v2.2s, v2.2s, v16.2s            // clamp above at [xMax, yMax]
    smin    v3.2s, v3.2s, v16.2s

    // ---- byte offsets of the four corners --------------------------------
    mul     v2.2s, v2.2s, v7.2s             // [3*x0, y0*yStride]
    mul     v3.2s, v3.2s, v7.2s             // [3*x1, y1*yStride]
    mov     v2.s[2], v3.s[0]
    mov     v3.s[2], v2.s[0]
    mov     v2.s[3], v2.s[1]                // v2 = [3*x0, y0*yStride, 3*x1, y0*yStride]
    mov     v3.s[3], v3.s[1]                // v3 = [3*x1, y1*yStride, 3*x0, y1*yStride]
    uaddlp  v2.2d, v2.4s                    // pairwise sums, widened: offsets of [c00, c01]
    uaddlp  v3.2d, v3.4s                    // offsets of [c11, c10]
    add     v2.2d, v20.2d, v2.2d            // addresses of [c00, c01]
    add     v3.2d, v20.2d, v3.2d            // addresses of [c11, c10]

    // ---- fetch four 3-byte pixels -----------------------------------------
    // Each pixel is read as one halfword plus one byte so that exactly three
    // bytes are touched. Byte lanes 3 and 7 of v5/v6 are never loaded, so the
    // fourth lane of every vector below is don't-care and is never stored.
    mov     x12, v2.d[0]                    // &c00 = (x0, y0)
    mov     x13, v2.d[1]                    // &c01 = (x1, y0)
    ld1     {v5.h}[0], [x12], #2
    ld1     {v5.b}[2], [x12]
    ld1     {v5.h}[2], [x13], #2
    ld1     {v5.b}[6], [x13]
    mov     x12, v3.d[0]                    // &c11 = (x1, y1)
    uxtl    v5.8h, v5.8b
    mov     x13, v3.d[1]                    // &c10 = (x0, y1)
    ld1     {v6.h}[0], [x12], #2
    ld1     {v6.b}[2], [x12]
    ld1     {v6.h}[2], [x13], #2
    ld1     {v6.b}[6], [x13]
    uxtl    v6.8h, v6.8b

    // ---- widen to float: v2 = c00, v3 = c01, v5 = c11, v6 = c10 ------------
    // (the address vectors in v2/v3 are dead from here on)
    uxtl    v2.4s, v5.4h                    // c00
    uxtl2   v3.4s, v5.8h                    // c01
    ucvtf   v2.4s, v2.4s
    uxtl    v5.4s, v6.4h                    // c11
    ucvtf   v3.4s, v3.4s
    uxtl2   v6.4s, v6.8h                    // c10
    ucvtf   v5.4s, v5.4s
    ucvtf   v6.4s, v6.4s

    // ---- bilinear blend ------------------------------------------------------
    fsub    v3.4s, v3.4s, v2.4s             // c01 - c00
    fsub    v5.4s, v5.4s, v6.4s             // c11 - c10
    fmla    v2.4s, v3.4s, v4.s[0]           // row y0: c00 + (c01 - c00) * xF
    fmla    v6.4s, v5.4s, v4.s[0]           // row y1: c10 + (c11 - c10) * xF
    fsub    v6.4s, v6.4s, v2.4s             // row y1 - row y0
    fmla    v2.4s, v6.4s, v4.s[1]           // row y0 + (row y1 - row y0) * yF

    // ---- back to bytes: truncate, then narrow with unsigned saturation -------
    fcvtzs  v2.4s, v2.4s
    uqxtn   v2.4h, v2.4s
    uqxtn   v2.8b, v2.8h                    // bytes 0..2 = the three output channels

    fadd    v0.2s, v0.2s, v1.2s             // (x, y) += (dx, dy)
    subs    x3, x3, #1
    st1     {v2.h}[0], [x1], #2             // channels 0 and 1
    st1     {v2.b}[2], [x1], #1             // channel 2 (byte lane 2, not lane 0)
    bne     L1Loop

End:
    ret

#endif