/*
 * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Sum of absolute differences between two 16-byte-wide blocks of h rows.
// Each row of pix1 and pix2 is read as 16 bytes; both pointers advance by
// stride after every row. The 32-bit sum is returned in w0.
//
// h must be positive: the tail loop decrements h before testing it for zero,
// so h == 0 is not handled.
//
// Modifies: x1, x2, w4, v0-v7, v16-v18, condition flags.
function ff_pix_abs16_neon, export=1
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        cmp             w4, #4                      // if h < 4, jump to completion section
        movi            v18.4S, #0                  // clear result accumulator
        b.lt            2f
1:
        // Unrolled loop: four rows per pass. Loads are interleaved with the
        // arithmetic of the previously loaded rows.
        ld1             {v0.16b}, [x1], x3          // load pix1, row 0
        ld1             {v4.16b}, [x2], x3          // load pix2, row 0
        ld1             {v1.16b}, [x1], x3          // load pix1, row 1
        ld1             {v5.16b}, [x2], x3          // load pix2, row 1
        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference 0..7, starts a fresh accumulator
        uabdl2          v17.8h, v0.16b, v4.16b      // absolute difference 8..15, starts a fresh accumulator
        ld1             {v2.16b}, [x1], x3          // load pix1, row 2
        ld1             {v6.16b}, [x2], x3          // load pix2, row 2
        uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate, row 1
        uabal2          v17.8h, v1.16b, v5.16b
        ld1             {v3.16b}, [x1], x3          // load pix1, row 3
        ld1             {v7.16b}, [x2], x3          // load pix2, row 3
        uabal           v16.8h, v2.8b, v6.8b        // absolute difference accumulate, row 2
        uabal2          v17.8h, v2.16b, v6.16b
        sub             w4, w4, #4                  // h -= 4
        uabal           v16.8h, v3.8b, v7.8b        // absolute difference accumulate, row 3
        uabal2          v17.8h, v3.16b, v7.16b
        cmp             w4, #4                      // if h >= 4, loop (flags consumed by b.ge below;
                                                    // the vector instructions in between do not touch them)
        add             v16.8h, v16.8h, v17.8h      // merge the low-half and high-half accumulators
        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
        add             d18, d16, d18               // add to the end result register

        b.ge            1b
        cbnz            w4, 2f                      // if iterations remain, jump to completion section

        fmov            w0, s18                     // copy result to general purpose register
        ret

2:
        // Completion loop: one row per pass, for h < 4 or the h % 4 leftover rows.
        ld1             {v0.16b}, [x1], x3          // load pix1
        ld1             {v4.16b}, [x2], x3          // load pix2
        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference 0..7
        uabal2          v16.8h, v0.16b, v4.16b      // absolute difference accumulate 8..15
        subs            w4, w4, #1                  // h -= 1, sets flags for b.ne below
        addv            h16, v16.8h                 // add up v16
        add             d18, d16, d18               // add to result

        b.ne            2b

        fmov            w0, s18                     // copy result to general purpose register
        ret
endfunc

// Sum of absolute differences between a 16-byte-wide block of pix1 and the
// rounded 2x2 average of pix2:
//     avg = (pix2[x] + pix2[x+1] + pix3[x] + pix3[x+1] + 2) >> 2
// where pix3 = pix2 + stride. The 32-bit sum over h rows is returned in w0.
//
// Each reference row is read as 17 bytes (offsets 0..16), and h + 1 reference
// rows are read in total (the initial pix2 row plus one pix3 row per iteration).
//
// h must be positive: the tail loop decrements h before testing it for zero,
// so h == 0 is not handled.
//
// Modifies: x1, x5, w4, v0-v7, v16-v24, v26-v31, condition flags.
// Only caller-saved SIMD registers are used; v8-v15 are left untouched.
function ff_pix_abs16_xy2_neon, export=1
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h

        add             x5, x2, x3                  // use x5 to hold uint8_t *pix3
        movi            v0.2d, #0                   // initialize the result register

        // Load initial pix2 values for either the unrolled version or completion version.
        ldur            q4, [x2, #1]                // load pix2+1
        ldr             q3, [x2]                    // load pix2
        uaddl           v2.8h, v4.8b, v3.8b         // pix2 + pix2+1 0..7
        uaddl2          v3.8h, v4.16b, v3.16b       // pix2 + pix2+1 8..15
        cmp             w4, #4                      // if h < 4 jump to the completion version
        b.lt            2f
1:
        // This is an unrolled implementation. It completes 4 iterations of the C for each branch.
        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
        // plus two at the beginning to start.
        ldur            q5, [x5, #1]                // load pix3+1
        ld1             {v4.16b}, [x5], x3          // load pix3
        ld1             {v1.16b}, [x1], x3          // load pix1

        ldur            q7, [x5, #1]                // load pix3+1
        ld1             {v6.16b}, [x5], x3          // load pix3
        ld1             {v16.16b}, [x1], x3         // load pix1

        ldur            q19, [x5, #1]               // load pix3+1
        ld1             {v18.16b}, [x5], x3         // load pix3
        ld1             {v17.16b}, [x1], x3         // load pix1

        ldur            q22, [x5, #1]               // load pix3+1
        ld1             {v21.16b}, [x5], x3         // load pix3
        ld1             {v20.16b}, [x1], x3         // load pix1

        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
        uaddl           v30.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
        uaddl2          v31.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
        add             v23.8h, v2.8h, v30.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
        add             v24.8h, v3.8h, v31.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
        rshrn           v23.8b, v23.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v23.16b, v24.8h, #2         // shift right 2 8..15

        uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
        add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v26.8b, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v26.16b, v27.8h, #2         // shift right 2 8..15

        uaddl           v4.8h, v18.8b, v19.8b       // pix3 + pix3+1 0..7
        uaddl2          v5.8h, v18.16b, v19.16b     // pix3 + pix3+1 8..15
        add             v28.8h, v2.8h, v4.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v29.8h, v3.8h, v5.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15

        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
        add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15

        // Averages are now stored in these registers:
        //      v23, v26, v28, v30
        // pix1 values in these registers:
        //      v1, v16, v17, v20
        // carried into the next pass (or into the completion loop) as pix2 + pix2+1:
        //      v2, v3
        // available:
        //      v4, v5, v7, v18, v19, v24, v25, v27, v29, v31

        sub             w4, w4, #4                  // h -= 4

        // Using absolute-difference instructions instead of absolute-difference-accumulate allows
        // us to keep the results in 16b vectors instead of widening values with twice the instructions.
        // This approach also has fewer data dependencies, allowing better instruction level parallelism.
        uabd            v4.16b, v1.16b, v23.16b     // absolute difference 0..15, i=0
        uabd            v5.16b, v16.16b, v26.16b    // absolute difference 0..15, i=1
        uabd            v6.16b, v17.16b, v28.16b    // absolute difference 0..15, i=2
        uabd            v7.16b, v20.16b, v30.16b    // absolute difference 0..15, i=3

        cmp             w4, #4                      // loop if h >= 4 (flags consumed by b.ge below)

        // Now add up all the values in each vector, v4-v7 with widening adds
        uaddl           v19.8h, v4.8b, v5.8b
        uaddl2          v18.8h, v4.16b, v5.16b
        uaddl           v4.8h, v6.8b, v7.8b
        uaddl2          v5.8h, v6.16b, v7.16b
        add             v4.8h, v4.8h, v5.8h
        add             v4.8h, v4.8h, v18.8h
        add             v4.8h, v4.8h, v19.8h
        uaddlv          s4, v4.8h                   // finish adding up accumulated values
        add             d0, d0, d4                  // add the value to the top level accumulator

        b.ge            1b
        cbnz            w4, 2f                      // if iterations remain jump to completion section

        fmov            w0, s0                      // copy result to general purpose register
        ret
2:
        // v2 and v3 are set either at the end of this loop or by the unrolled version,
        // which branches here to complete iterations when h % 4 != 0.
        ldur            q5, [x5, #1]                // load pix3+1
        ld1             {v4.16b}, [x5], x3          // load pix3
        ld1             {v1.16b}, [x1], x3          // load pix1
        subs            w4, w4, #1                  // decrement h, sets flags for b.ne below

        uaddl           v18.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
        uaddl2          v19.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
        add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
        add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
        // divide by 4 to compute the average of values summed above
        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15

        // v7 is the scratch register here rather than v8: AAPCS64 requires the low 64 bits
        // of v8-v15 to be preserved by the callee, and this function saves no registers.
        // v7 holds nothing live at this point (it is only a temporary in the unrolled loop).
        uxtl2           v7.8h, v1.16b               // 8->16 bits pix1 8..15
        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7

        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
        uaba            v6.8h, v7.8h, v17.8h        // absolute difference accumulate 8..15
        mov             v2.16b, v18.16b             // pix3 -> pix2
        mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
        uaddlv          s6, v6.8h                   // add up accumulator in v6
        add             d0, d0, d6                  // add to the final result

        b.ne            2b                          // loop if h > 0
        fmov            w0, s0                      // copy result to general purpose register
        ret
endfunc