/*
 * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/aarch64/asm.S"
22cabdff1aSopenharmony_ci
function ff_pix_abs16_neon, export=1
        // Sum of absolute differences between two 16-pixel-wide blocks of h rows.
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        // returns      w0 = accumulated sum
        //
        // v20 holds the running total; v16/v17 hold per-pass 16-bit partial sums.
        // pix1 rows live in even registers (v0, v2, v4, v6), pix2 rows in odd ones.
        movi            v20.4s, #0                  // total = 0
        cmp             w4, #4
        b.lt            2f                          // fewer than 4 rows: single-row loop only
1:
        // Four rows per pass; loads are interleaved with the arithmetic.
        ld1             {v0.16b}, [x1], x3          // pix1 row 0
        ld1             {v1.16b}, [x2], x3          // pix2 row 0
        ld1             {v2.16b}, [x1], x3          // pix1 row 1
        ld1             {v3.16b}, [x2], x3          // pix2 row 1
        uabdl           v16.8h, v0.8b, v1.8b        // row 0: |pix1 - pix2| 0..7, starts the partial sum
        uabdl2          v17.8h, v0.16b, v1.16b      // row 0: |pix1 - pix2| 8..15
        ld1             {v4.16b}, [x1], x3          // pix1 row 2
        ld1             {v5.16b}, [x2], x3          // pix2 row 2
        uabal           v16.8h, v2.8b, v3.8b        // row 1 accumulated, 0..7
        uabal2          v17.8h, v2.16b, v3.16b      // row 1 accumulated, 8..15
        ld1             {v6.16b}, [x1], x3          // pix1 row 3
        ld1             {v7.16b}, [x2], x3          // pix2 row 3
        uabal           v16.8h, v4.8b, v5.8b        // row 2 accumulated, 0..7
        uabal2          v17.8h, v4.16b, v5.16b      // row 2 accumulated, 8..15
        sub             w4, w4, #4                  // four rows consumed
        uabal           v16.8h, v6.8b, v7.8b        // row 3 accumulated, 0..7
        uabal2          v17.8h, v6.16b, v7.16b      // row 3 accumulated, 8..15
        cmp             w4, #4                      // flags consumed by b.ge below; the vector ops leave them alone
        add             v16.8h, v16.8h, v17.8h      // fold both halves together
        uaddlv          s16, v16.8h                 // horizontal sum of the 8 lanes
        add             d20, d16, d20               // total += pass sum

        b.ge            1b                          // at least 4 rows left
        cbz             w4, 3f                      // nothing left over: done

2:
        // One row per pass, for h < 4 or the h % 4 leftover rows.
        ld1             {v0.16b}, [x1], x3          // pix1 row
        ld1             {v1.16b}, [x2], x3          // pix2 row
        uabdl           v16.8h, v0.8b, v1.8b        // |pix1 - pix2| 0..7
        uabal2          v16.8h, v0.16b, v1.16b      // plus |pix1 - pix2| 8..15
        subs            w4, w4, #1                  // one row consumed
        addv            h16, v16.8h                 // horizontal sum (16 * 255 fits in 16 bits)
        add             d20, d16, d20               // total += row sum
        b.ne            2b

3:
        fmov            w0, s20                     // return total
        ret
endfunc
74cabdff1aSopenharmony_ci
function ff_pix_abs16_xy2_neon, export=1
        // Sum of absolute differences between a 16-pixel-wide block of pix1 and the
        // rounded 2x2 average of pix2. With pix3 = pix2 + stride, each pix1[n] is
        // compared against (pix2[n] + pix2[n+1] + pix3[n] + pix3[n+1] + 2) >> 2.
        // Every pix2/pix3 row access reads 17 bytes (offsets 0..16), and h + 1 rows
        // of pix2 are read in total.
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        // returns      w0 = accumulated sum
        //
        // Only v0-v7 and v16-v31 are used as scratch. The low halves of v8-v15 are
        // callee-saved under AAPCS64 and must not be written here without saving them.

        add             x5, x2, x3                  // use x5 to hold uint8_t *pix3
        movi            v0.2d, #0                   // initialize the result register

        // Load initial pix2 values for either the unrolled version or completion version.
        ldur            q4, [x2, #1]                // load pix2+1
        ldr             q3, [x2]                    // load pix2
        uaddl           v2.8h, v4.8b, v3.8b         // pix2 + pix2+1 0..7
        uaddl2          v3.8h, v4.16b, v3.16b       // pix2 + pix2+1 8..15
        cmp             w4, #4                      // if h < 4 jump to the completion version
        b.lt            2f
1:
        // This is an unrolled implementation. It completes 4 iterations of the C for each branch.
        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
        // plus two at the beginning to start.
        ldur            q5, [x5, #1]                // load pix3+1
        ld1             {v4.16b}, [x5], x3          // load pix3
        ld1             {v1.16b}, [x1], x3          // load pix1

        ldur            q7, [x5, #1]                // load pix3+1
        ld1             {v6.16b}, [x5], x3          // load pix3
        ld1             {v16.16b}, [x1], x3         // load pix1

        ldur            q19, [x5, #1]               // load pix3+1
        ld1             {v18.16b}, [x5], x3         // load pix3
        ld1             {v17.16b}, [x1], x3         // load pix1

        ldur            q22, [x5, #1]               // load pix3+1
        ld1             {v21.16b}, [x5], x3         // load pix3
        ld1             {v20.16b}, [x1], x3         // load pix1

        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
        // Each block reuses the horizontal pair sums of the previous row as its "pix2" term.
        uaddl           v30.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
        uaddl2          v31.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
        add             v23.8h, v2.8h, v30.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
        add             v24.8h, v3.8h, v31.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
        rshrn           v23.8b, v23.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v23.16b, v24.8h, #2         // shift right 2 8..15

        uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
        add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v26.8b, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v26.16b, v27.8h, #2         // shift right 2 8..15

        uaddl           v4.8h, v18.8b, v19.8b       // pix3 + pix3+1 0..7
        uaddl2          v5.8h, v18.16b, v19.16b     // pix3 + pix3+1 8..15
        add             v28.8h, v2.8h, v4.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v29.8h, v3.8h, v5.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15

        // v2/v3 written here stay live: they are the "pix2" pair sums for the next
        // unrolled pass or for the completion loop.
        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
        add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15

        // Averages are now stored in these registers:
        // v23, v26, v28, v30
        // pix1 values in these registers:
        // v1, v16, v17, v20
        // available:
        // v4, v5, v7, v18, v19, v24, v25, v27, v29, v31

        sub             w4, w4, #4                  // h -= 4

        // Using absolute-difference instructions instead of absolute-difference-accumulate allows
        // us to keep the results in 16b vectors instead of widening values with twice the instructions.
        // This approach also has fewer data dependencies, allowing better instruction level parallelism.
        uabd            v4.16b, v1.16b, v23.16b     // absolute difference 0..15, i=0
        uabd            v5.16b, v16.16b, v26.16b    // absolute difference 0..15, i=1
        uabd            v6.16b, v17.16b, v28.16b    // absolute difference 0..15, i=2
        uabd            v7.16b, v20.16b, v30.16b    // absolute difference 0..15, i=3

        cmp             w4, #4                      // loop if h >= 4 (flags are consumed by b.ge below)

        // Now add up all the values in each vector, v4-v7 with widening adds.
        // Each 16-bit lane ends up holding at most 8 * 255, so nothing overflows.
        uaddl           v19.8h, v4.8b, v5.8b
        uaddl2          v18.8h, v4.16b, v5.16b
        uaddl           v4.8h, v6.8b, v7.8b
        uaddl2          v5.8h, v6.16b, v7.16b
        add             v4.8h, v4.8h, v5.8h
        add             v4.8h, v4.8h, v18.8h
        add             v4.8h, v4.8h, v19.8h
        uaddlv          s4, v4.8h                   // finish adding up accumulated values
        add             d0, d0, d4                  // add the value to the top level accumulator

        b.ge            1b
        cbnz            w4, 2f                      // if iterations remain jump to completion section

        fmov            w0, s0                      // copy result to general purpose register
        ret
2:
        // v2 and v3 are set either at the end of this loop or by the unrolled version,
        // which branches here to complete iterations when h % 4 != 0.
        ldur            q5, [x5, #1]                // load pix3+1
        ld1             {v4.16b}, [x5], x3          // load pix3
        ld1             {v1.16b}, [x1], x3          // load pix1
        subs            w4, w4, #1                  // decrement h

        uaddl           v18.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
        uaddl2          v19.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
        add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
        add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
        // divide by 4 to compute the average of values summed above
        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15

        // v7 is used as the temporary: it is dead in this loop, and unlike v8 it is
        // caller-saved, so the caller's d8 is left untouched.
        uxtl2           v7.8h, v1.16b               // 8->16 bits pix1 8..15
        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7

        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
        uaba            v6.8h, v7.8h, v17.8h        // absolute difference accumulate 8..15
        mov             v2.16b, v18.16b             // pix3 -> pix2
        mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
        uaddlv          s6, v6.8h                   // add up accumulator in v6
        add             d0, d0, d6                  // add to the final result

        b.ne            2b                          // loop if h > 0
        fmov            w0, s0                      // copy result to general purpose register
        ret
endfunc
206