/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                             const uint8_t *ref, ptrdiff_t ref_stride,
@                             int h, int mx, int my);
@
@ As used by the code below: r0 = dst, r1 = dst_stride, r2 = ref,
@ r3 = ref_stride, and h is the first word on the stack at entry.
@ The copy/avg functions ignore mx and my.

@ Copy a 64 pixel wide block, one row per loop iteration.
function ff_vp9_copy64_neon, export=1
        ldr             r12, [sp]                       @ r12 = h
        @ Each row is moved as two 32 byte halves; the first half
        @ post-increments the pointer by 32, so take that off the strides.
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
1:
        vld1.8          {q0,  q1},  [r2]!
        vst1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q2,  q3},  [r2], r3
        subs            r12, r12, #1
        vst1.8          {q2,  q3},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

@ Rounding average of a 64 pixel wide block of ref into dst,
@ one row per loop iteration.
function ff_vp9_avg64_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]                   @ r12 = h (one word was pushed)
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
        @ r0 walks dst for the loads, lr walks dst for the stores
        mov             lr,  r0
1:
        vld1.8          {q8,  q9},  [r2]!
        vld1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q10, q11}, [r2], r3
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q2,  q3},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q9
        vrhadd.u8       q2,  q2,  q10
        vst1.8          {q0,  q1},  [lr, :128]!
        vrhadd.u8       q3,  q3,  q11
        vst1.8          {q2,  q3},  [lr, :128], r1
        subs            r12, r12, #1
        bne             1b
        pop             {pc}
endfunc

@ Copy a 32 pixel wide block, one row per loop iteration.
function ff_vp9_copy32_neon, export=1
        ldr             r12, [sp]                       @ r12 = h
1:
        vld1.8          {q0,  q1},  [r2], r3
        subs            r12, r12, #1
        vst1.8          {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

@ Rounding average of a 32 pixel wide block of ref into dst,
@ one row per loop iteration.
function ff_vp9_avg32_neon, export=1
        ldr             r12, [sp]                       @ r12 = h
1:
        vld1.8          {q2,  q3},  [r2], r3
        vld1.8          {q0,  q1},  [r0, :128]
        vrhadd.u8       q0,  q0,  q2
        vrhadd.u8       q1,  q1,  q3
        subs            r12, r12, #1
        vst1.8          {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

@ Copy a 16 pixel wide block, two rows per loop iteration.
function ff_vp9_copy16_neon, export=1
        push            {r4,lr}
        ldr             r12, [sp, #8]                   @ r12 = h (two words were pushed)
        @ r4/lr point at the second row of dst/ref; both strides are
        @ doubled so that each pointer pair steps two rows at a time.
        add             r4,  r0,  r1
        add             lr,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
1:
        vld1.8          {q0},  [r2], r3
        vld1.8          {q1},  [lr], r3
        subs            r12, r12, #2
        vst1.8          {q0},  [r0, :128], r1
        vst1.8          {q1},  [r4, :128], r1
        bne             1b
        pop             {r4,pc}
endfunc

@ Rounding average of a 16 pixel wide block of ref into dst,
@ two rows per loop iteration.
function ff_vp9_avg16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]                   @ r12 = h
        @ r0 walks dst for the loads, lr walks dst for the stores
        mov             lr,  r0
1:
        vld1.8          {q2},  [r2], r3
        vld1.8          {q0},  [r0, :128], r1
        vld1.8          {q3},  [r2], r3
        vrhadd.u8       q0,  q0,  q2
        vld1.8          {q1},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q3
        subs            r12, r12, #2
        vst1.8          {q0},  [lr, :128], r1
        vst1.8          {q1},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

@ Copy an 8 pixel wide block, two rows per loop iteration.
function ff_vp9_copy8_neon, export=1
        ldr             r12, [sp]                       @ r12 = h
1:
        vld1.8          {d0},  [r2], r3
        vld1.8          {d1},  [r2], r3
        subs            r12, r12, #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

@ Rounding average of an 8 pixel wide block of ref into dst,
@ two rows per loop iteration.
function ff_vp9_avg8_neon, export=1
        ldr             r12, [sp]                       @ r12 = h
1:
        vld1.8          {d2},  [r2], r3
        vld1.8          {d0},  [r0, :64], r1
        vld1.8          {d3},  [r2], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d1},  [r0, :64]
        @ step r0 back to the first of the two rows before storing
        sub             r0,  r0,  r1
        vrhadd.u8       d1,  d1,  d3
        subs            r12, r12, #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

@ Copy a 4 pixel wide block, four rows per loop iteration.
function ff_vp9_copy4_neon, export=1
        ldr             r12, [sp]                       @ r12 = h
1:
        vld1.32         {d0[]},   [r2], r3
        vld1.32         {d1[]},   [r2], r3
        vst1.32         {d0[0]},  [r0, :32], r1
        vld1.32         {d2[]},   [r2], r3
        vst1.32         {d1[0]},  [r0, :32], r1
        vld1.32         {d3[]},   [r2], r3
        subs            r12, r12, #4
        vst1.32         {d2[0]},  [r0, :32], r1
        vst1.32         {d3[0]},  [r0, :32], r1
        bne             1b
        bx              lr
endfunc

@ Rounding average of a 4 pixel wide block of ref into dst,
@ four rows per loop iteration.
function ff_vp9_avg4_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]                   @ r12 = h
        @ r0 walks dst for the loads, lr walks dst for the stores
        mov             lr,  r0
1:
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d0[]},   [r0, :32], r1
        vld1.32         {d5[]},   [r2], r3
        vrhadd.u8       d0,  d0,  d4
        vld1.32         {d1[]},   [r0, :32], r1
        vld1.32         {d6[]},   [r2], r3
        vrhadd.u8       d1,  d1,  d5
        vld1.32         {d2[]},   [r0, :32], r1
        vld1.32         {d7[]},   [r2], r3
        vrhadd.u8       d2,  d2,  d6
        vld1.32         {d3[]},   [r0, :32], r1
        subs            r12, r12, #4
        vst1.32         {d0[0]},  [lr, :32], r1
        vrhadd.u8       d3,  d3,  d7
        vst1.32         {d1[0]},  [lr, :32], r1
        vst1.32         {d2[0]},  [lr, :32], r1
        vst1.32         {d3[0]},  [lr, :32], r1
        bne             1b
        pop             {pc}
endfunc

@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
@ (indices 0-3 select a lane of d0, indices 4-7 a lane of d1).
.macro vmul_lane dst, src, idx
.if \idx < 4
        vmul.s16        \dst, \src, d0[\idx]
.else
        vmul.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmla_lane dst, src, idx
.if \idx < 4
        vmla.s16        \dst, \src, d0[\idx]
.else
        vmla.s16        \dst, \src, d1[\idx - 4]
.endif
.endm

@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
@ dst1-dst2 and dst3-dst4 for size >= 16)
@ q14 and q15 are always used as scratch; q5 and q6 additionally
@ for size >= 16. For size 4 only the low halves (d28, d30) are used.
.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmla_lane       \dst1,  q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmla_lane       \dst3,  q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmla_lane       \dst2,  q5,  \offset
        vmla_lane       \dst4,  q6,  \offset
.elseif \size == 8
        vmla_lane       \dst1,  q14, \offset
        vmla_lane       \dst3,  q15, \offset
.else
        vmla_lane       \dst1d, d28, \offset
        vmla_lane       \dst3d, d30, \offset
.endif
.endm
@ The same as above, but don't accumulate straight into the
@ destination, but use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmul_lane       q14, q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmul_lane       q15, q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmul_lane       q5,  q5,  \offset
        vmul_lane       q6,  q6,  \offset
.elseif \size == 8
        vmul_lane       q14, q14, \offset
        vmul_lane       q15, q15, \offset
.else
        vmul_lane       d28, d28, \offset
        vmul_lane       d30, d30, \offset
.endif
        @ Saturating add of the separately computed products
.if \size == 4
        vqadd.s16       \dst1d, \dst1d, d28
        vqadd.s16       \dst3d, \dst3d, d30
.else
        vqadd.s16       \dst1,  \dst1,  q14
        vqadd.s16       \dst3,  \dst3,  q15
.if \size >= 16
        vqadd.s16       \dst2,  \dst2,  q5
        vqadd.s16       \dst4,  \dst4,  q6
.endif
.endif
.endm


@ Instantiate a horizontal filter function for the given size.
@ This can work on 4, 8 or 16 pixels in parallel; for larger
@ widths it will do 16 pixels at a time and loop horizontally.
@ The actual width is passed in r5, the height in r4 and
@ the filter coefficients in r12. idx2 is the index of the largest
@ filter coefficient (3 or 4) and idx1 is the other one of them.
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        @ The filter window starts 3 pixels to the left of the output pixel
        sub             r2,  r2,  #3
        @ Two rows are produced per iteration: r6/r7 address the second
        @ row of dst/src, and both strides are doubled.
        add             r6,  r0,  r1
        add             r7,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
.if \size >= 16
        @ Only size >= 16 loops horizontally with post-incrementing
        @ accesses, so the strides are reduced by what one row advances:
        @ the width for dst, and the width plus 8 for src (the first
        @ load of a row fetches 24 bytes, every later one 16).
        sub             r1,  r1,  r5
        sub             r3,  r3,  r5
        sub             r3,  r3,  #8
.endif
        @ Load the filter vector (eight 16 bit coefficients into d0-d1)
        vld1.16         {q0},  [r12,:128]
1:
        @ Start of a pair of rows: load src
.if \size >= 16
        @ r12 counts the remaining width of this pair of rows
        mov             r12, r5
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
.else
        @ For size 4/8 one qword without postincrement is enough
        vld1.8          {q9},  [r2]
        vld1.8          {q12}, [r7]
.endif
        @ Widen the pixels to 16 bit
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
        vmovl.u8        q11, d24
        vmovl.u8        q12, d25
.if \size >= 16
        vmovl.u8        q10, d20
        vmovl.u8        q13, d26
.endif
2:

        @ Accumulate, adding idx2 last with a separate
        @ saturating add. The positive filter coefficients
        @ for all indices except idx2 must add up to less
        @ than 127 for this not to overflow.
        vmul.s16        q1,  q8,  d0[0]
        vmul.s16        q3,  q11, d0[0]
.if \size >= 16
        vmul.s16        q2,  q9,  d0[0]
        vmul.s16        q4,  q12, d0[0]
.endif
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 1,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 2,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx1, \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 5,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 6,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 7,     \size
        extmulqadd      q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx2, \size

        @ Round, shift and saturate
        vqrshrun.s16    d2,  q1,  #7
        vqrshrun.s16    d6,  q3,  #7
.if \size >= 16
        vqrshrun.s16    d3,  q2,  #7
        vqrshrun.s16    d7,  q4,  #7
.endif
        @ Average with the existing dst pixels (avg variants only)
.ifc \type,avg
.if \size >= 16
        vld1.8          {q14}, [r0,:128]
        vld1.8          {q15}, [r6,:128]
        vrhadd.u8       q1,  q1,  q14
        vrhadd.u8       q3,  q3,  q15
.elseif \size == 8
        vld1.8          {d28}, [r0,:64]
        vld1.8          {d30}, [r6,:64]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.else
        @ We only need d28[0], but [] is faster on some cores
        vld1.32         {d28[]}, [r0,:32]
        vld1.32         {d30[]}, [r6,:32]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.endif
.endif
        @ Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            r12, r12, #16
        vst1.8          {q1},  [r0,:128]!
        vst1.8          {q3},  [r6,:128]!
        beq             3f
        @ More columns remain: the last 8 widened pixels become the
        @ first 8 of the next window, then 16 new pixels are loaded
        @ and widened behind them.
        vmov            q8,  q10
        vmov            q11, q13
        vld1.8          {q10}, [r2]!
        vld1.8          {q13}, [r7]!
        vmovl.u8        q9,  d20
        vmovl.u8        q10, d21
        vmovl.u8        q12, d26
        vmovl.u8        q13, d27
        b               2b
.elseif \size == 8
        vst1.8          {d2},  [r0,:64]
        vst1.8          {d6},  [r6,:64]
.else @ \size == 4
        vst1.32         {d2[0]},  [r0,:32]
        vst1.32         {d6[0]},  [r6,:32]
.endif
3:
        @ Loop vertically, two rows at a time
        add             r0,  r0,  r1
        add             r6,  r6,  r1
        add             r2,  r2,  r3
        add             r7,  r7,  r3
        subs            r4,  r4,  #2
        bne             1b
        @ Undo the pushes done by the exported entry point that
        @ branched here
.if \size >= 16
        vpop            {q4-q6}
.endif
        pop             {r4-r7}
        bx              lr
endfunc
.endm

@ Instantiate put and avg variants for both coefficient orderings
.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

@ Exported entry point for one filter type and block width. It saves
@ the registers the shared implementation uses, loads h into r4 and mx
@ into r5, points r12 at the coefficient row for mx, sets r5 to the
@ width and branches to the shared implementation, which does the
@ matching pops and returns to the caller.
.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        push            {r4-r7}
.if \size >= 16
        @ 16 bytes of core registers plus 48 bytes for q4-q6 are on the
        @ stack in front of the stack arguments
        vpush           {q4-q6}
        ldr             r4,  [sp, #64]
        ldr             r5,  [sp, #68]
.else
        ldr             r4,  [sp, #16]
        ldr             r5,  [sp, #20]
.endif
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        @ 256 bytes per filter type, 16 bytes per mx value
        add             r12, r12, 256*\offset
        @ The flags from this compare are consumed by the bge below;
        @ the add and mov in between do not touch them.
        cmp             r5,  #8
        add             r12, r12, r5, lsl #4
        mov             r5,  #\size
        @ mx >= 8 selects the 34 variant (idx2 = 4), otherwise the 43 one.
        @ All widths >= 16 share the 16 pixel implementation.
.if \size >= 16
        bge             \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        bge             \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

@ Instantiate the exported functions for all filter types
.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp,   2, \size
do_8tap_h_func avg, sharp,   2, \size
do_8tap_h_func put, smooth,  0, \size
do_8tap_h_func avg, smooth,  0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4

.ltorg

@ Vertical filters

@ Round, shift and saturate and store qreg1-2 over 4 lines
@ Each d register holds two 4 pixel rows: dreg1 carries output rows
@ 1 and 3, dreg2 carries rows 2 and 4. For avg, the current dst rows
@ are loaded into tmp1/tmp2 in the same layout and r0 is rewound by
@ four rows before storing.
.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
        vqrshrun.s16    \dreg1,  \qreg1, #7
        vqrshrun.s16    \dreg2,  \qreg2, #7
.ifc \type,avg
        vld1.32         {\tmp1[]},   [r0,:32], r1
        vld1.32         {\tmp2[]},   [r0,:32], r1
        vld1.32         {\tmp1[1]},  [r0,:32], r1
        vld1.32         {\tmp2[1]},  [r0,:32], r1
        vrhadd.u8       \dreg1,  \dreg1,  \tmp1
        vrhadd.u8       \dreg2,  \dreg2,  \tmp2
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.32         {\dreg1[0]}, [r0,:32], r1
        vst1.32         {\dreg2[0]}, [r0,:32], r1
        vst1.32         {\dreg1[1]}, [r0,:32], r1
        vst1.32         {\dreg2[1]}, [r0,:32], r1
.endm

@ Round, shift and saturate and store qreg1-4
@ One 8 pixel row per register, four rows in total. For avg, the
@ current dst rows are loaded into tmp1-tmp4 and r0 is rewound by
@ four rows before storing.
.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
        vqrshrun.s16    \dreg1,  \qreg1, #7
        vqrshrun.s16    \dreg2,  \qreg2, #7
        vqrshrun.s16    \dreg3,  \qreg3, #7
        vqrshrun.s16    \dreg4,  \qreg4, #7
.ifc \type,avg
        vld1.8          {\tmp1},  [r0,:64], r1
        vld1.8          {\tmp2},  [r0,:64], r1
        vld1.8          {\tmp3},  [r0,:64], r1
        vld1.8          {\tmp4},  [r0,:64], r1
        vrhadd.u8       \dreg1,  \dreg1,  \tmp1
        vrhadd.u8       \dreg2,  \dreg2,  \tmp2
        vrhadd.u8       \dreg3,  \dreg3,  \tmp3
        vrhadd.u8       \dreg4,  \dreg4,  \tmp4
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.8          {\dreg1}, [r0,:64], r1
        vst1.8          {\dreg2}, [r0,:64], r1
        vst1.8          {\dreg3}, [r0,:64], r1
        vst1.8          {\dreg4}, [r0,:64], r1
.endm

@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
@ at the end with saturation. Indices 0 and 7 always have negative or zero
@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
@ largest coefficient.
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        vmul.s16        \dst1, \src2, d0[1]
        vmul.s16        \dst2, \src3, d0[1]
        vmul.s16        \tmp1, \src1, d0[0]
        vmul.s16        \tmp2, \src2, d0[0]
        vmla.s16        \dst1, \src3, d0[2]
        vmla.s16        \dst2, \src4, d0[2]
        @ idx1, the smaller of the two centre taps, goes into dst
.if \idx1 == 3
        vmla.s16        \dst1, \src4, d0[3]
        vmla.s16        \dst2, \src5, d0[3]
.else
        vmla.s16        \dst1, \src5, d1[0]
        vmla.s16        \dst2, \src6, d1[0]
.endif
        vmla.s16        \dst1, \src6, d1[1]
        vmla.s16        \dst2, \src7, d1[1]
        vmla.s16        \tmp1, \src8, d1[3]
        vmla.s16        \tmp2, \src9, d1[3]
        vmla.s16        \dst1, \src7, d1[2]
        vmla.s16        \dst2, \src8, d1[2]
        @ idx2, the largest tap, goes into tmp
.if \idx2 == 3
        vmla.s16        \tmp1, \src4, d0[3]
        vmla.s16        \tmp2, \src5, d0[3]
.else
        vmla.s16        \tmp1, \src5, d1[0]
        vmla.s16        \tmp2, \src6, d1[0]
.endif
        vqadd.s16       \dst1, \dst1, \tmp1
        vqadd.s16       \dst2, \dst2, \tmp2
.endm

@ Load pixels and extend them to 16 bit
@ Reads three rows of 8 pixels from r2 (four if dst4 is given),
@ advancing r2 by r3 after each row; d2-d5 are used as staging.
.macro loadl dst1, dst2, dst3, dst4
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
.ifnb \dst4
        vld1.8          {d5}, [r2], r3
.endif
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
.ifnb \dst4
        vmovl.u8        \dst4, d5
.endif
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
@ and idx1 is the other one of them.
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        @ The filter window starts 3 rows above the output row
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        @ Eight 16 bit coefficients into d0-d1
        vld1.16         {q0},  [r12, :128]
1:
        @ Start of an 8 pixel wide column; r12 counts the remaining rows
        mov             r12, r4

        @ Preload the first 7 input rows
        loadl           q5,  q6,  q7
        loadl           q8,  q9,  q10, q11
2:
        @ Each step loads 4 more rows and produces 4 output rows. The
        @ three steps differ only in which registers hold which rows;
        @ after the third step the assignment is back where it started.
        @ The scratch/destination registers passed to convolve reuse
        @ source registers whose rows have already been consumed.
        loadl           q12, q13, q14, q15
        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4,  q5
        convolve        q3,  q4,  q7,  q8,  q9,  q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5,  q6
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  d3,  d5,  d7,  d9,  \type

        subs            r12, r12, #4
        beq             8f

        loadl           q4,  q5,  q6,  q7
        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q4,  q5,  \idx1, \idx2, q8,  q9
        convolve        q3,  q8,  q11, q12, q13, q14, q15, q4,  q5,  q6,  q7,  \idx1, \idx2, q9,  q10
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q8,  d16, d3,  d5,  d7,  d17, \type

        subs            r12, r12, #4
        beq             8f

        loadl           q8,  q9,  q10, q11
        convolve        q1,  q2,  q13, q14, q15, q4,  q5,  q6,  q7,  q8,  q9,  \idx1, \idx2, q12, q13
        convolve        q3,  q12, q15, q4,  q5,  q6,  q7,  q8,  q9,  q10, q11, \idx1, \idx2, q13, q14
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q12, d24, d3,  d5,  d7,  d25, \type

        subs            r12, r12, #4
        bne             2b

8:
        @ Column finished; move on to the next 8 pixels, if any
        subs            r5,  r5,  #8
        beq             9f
        @ r0 -= h * dst_stride
        mls             r0,  r1,  r4, r0
        @ r2 -= h * src_stride
        mls             r2,  r3,  r4, r2
        @ r2 -= 8 * src_stride
        sub             r2,  r2,  r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2,  r2,  r3
        @ Step both pointers 8 pixels to the right
        add             r2,  r2,  #8
        add             r0,  r0,  #8
        b               1b
9:
        @ Undo the pushes done by the exported entry point that
        @ branched here
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3

@ Instantiate a vertical filter function for filtering a 4 pixels wide
@ slice. The first half of the registers contain one row, while the second
@ half of a register contains the second-next row (also stored in the first
@ half of the register two steps ahead). The convolution does two outputs
@ at a time; the output of q5-q12 into one, and q4-q13 into another one.
@ The first half of first output is the first output row, the first half
@ of the other output is the second output row. The second halves of the
@ registers are rows 3 and 4.
@ This only is designed to work for 4 or 8 output lines.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        @ The filter window starts 3 rows above the output row
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        @ Eight 16 bit coefficients into d0-d1
        vld1.16         {q0},  [r12, :128]

        @ Load 11 input rows of 4 pixels, each replicated across its
        @ d register. The vext steps pair every row with the row two
        @ lines further down, and the vmovl steps widen each pair to
        @ 16 bit in q5-q13. Loads, vext and vmovl are interleaved.
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vext.8          d2,  d2,  d4,  #4
        vld1.32         {d8[]},   [r2], r3
        vext.8          d3,  d3,  d5,  #4
        vld1.32         {d9[]},   [r2], r3
        vmovl.u8        q5,  d2
        vext.8          d4,  d4,  d6,  #4
        vld1.32         {d28[]},  [r2], r3
        vmovl.u8        q6,  d3
        vext.8          d5,  d5,  d7,  #4
        vld1.32         {d29[]},  [r2], r3
        vmovl.u8        q7,  d4
        vext.8          d6,  d6,  d8,  #4
        vld1.32         {d30[]},  [r2], r3
        vmovl.u8        q8,  d5
        vext.8          d7,  d7,  d9,  #4
        vmovl.u8        q9,  d6
        vext.8          d8,  d8,  d28, #4
        vmovl.u8        q10, d7
        vext.8          d9,  d9,  d29, #4
        vmovl.u8        q11, d8
        vext.8          d28, d28, d30, #4
        vmovl.u8        q12, d9
        vmovl.u8        q13, d28

        @ First four output rows
        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type
        subs            r4,  r4,  #4
        beq             9f

        @ Four more input rows for output rows 5-8. The first two are
        @ paired with the rows still held in d29/d30; the last two are
        @ loaded straight into the upper lanes of d2/d3.
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vext.8          d29, d29, d2,  #4
        vext.8          d30, d30, d3,  #4
        vld1.32         {d2[1]},  [r2], r3
        vmovl.u8        q14, d29
        vld1.32         {d3[1]},  [r2], r3
        vmovl.u8        q15, d30
        vmovl.u8        q5,  d2
        vmovl.u8        q6,  d3

        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q5,  q6,  \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type

9:
        @ Undo the pushes done by the exported entry point that
        @ branched here
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3

@ Exported entry point for one filter type and block width. It saves
@ r4-r5 and q4-q7, loads h into r4 and my into r5, points r12 at the
@ coefficient row for my, sets r5 to the width and branches to the
@ shared implementation, which does the matching pops and returns to
@ the caller.
.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        push            {r4-r5}
        vpush           {q4-q7}
        @ 8 + 64 bytes have been pushed in front of the stack arguments:
        @ h is at sp + 72 and my at sp + 80
        ldr             r4,  [sp, #72]
        movrelx         r12, X(ff_vp9_subpel_filters), r5
        ldr             r5,  [sp, #80]
        @ 256 bytes per filter type, 16 bytes per my value
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        @ The flags from this compare are consumed by the bge below;
        @ the mov in between does not touch them.
        cmp             r5,  #8
        mov             r5,  #\size
        @ my >= 8 selects the 34 variant (idx2 = 4), otherwise the 43 one.
        @ All widths >= 8 share the 8 pixel wide implementation.
.if \size >= 8
        bge             \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        bge             \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm

@ Instantiate the exported functions for all filter types
.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp,   2, \size
do_8tap_v_func avg, sharp,   2, \size
do_8tap_v_func put, smooth,  0, \size
do_8tap_v_func avg, smooth,  0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4