/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                            const uint8_t *ref, ptrdiff_t ref_stride,
@                            int h, int mx, int my);
@
@ The code below uses dst, dst_stride, ref and ref_stride from r0-r3 and
@ reads h, mx and my from the stack ([sp], [sp, #4] and [sp, #8] as seen
@ on function entry, before any push).

@ Copy h rows of 128 bytes each from ref (r2) to dst (r0).
@ Stores to dst carry a 128 bit alignment hint, loads from ref do not.
function ff_vp9_copy128_neon, export=1
        ldr             r12, [sp]                  @ r12 = h, the row counter
        @ Three of the four 32 byte transfers per row post-increment the
        @ pointers by 32, so only stride - 96 remains to be added by the
        @ last transfer of each row.
        sub             r1,  r1,  #96
        sub             r3,  r3,  #96
1:
        subs            r12, r12, #1               @ flags are consumed by the bne below
        vld1.16         {q0,  q1},  [r2]!
        vst1.16         {q0,  q1},  [r0, :128]!
        vld1.16         {q2,  q3},  [r2]!
        vst1.16         {q2,  q3},  [r0, :128]!
        vld1.16         {q8,  q9},  [r2]!
        vst1.16         {q8,  q9},  [r0, :128]!
        vld1.16         {q10, q11}, [r2], r3       @ advance to the next source row
        vst1.16         {q10, q11}, [r0, :128], r1 @ advance to the next dst row
        bne             1b
        bx              lr
endfunc

@ Average h rows of 64 unsigned 16 bit elements (128 bytes) from ref into
@ dst, using the rounding halving add vrhadd.u16.
@ r0 is the dst read pointer and lr the dst write pointer, both starting
@ at dst and both advancing by the same amounts per row.
function ff_vp9_avg64_16_neon, export=1
        push            {lr}                       @ lr doubles as the dst write pointer
        ldr             r12, [sp, #4]              @ r12 = h (one word was pushed above)
        @ As in the copy function, 96 bytes per row are covered by
        @ fixed post-increments.
        sub             r1,  r1,  #96
        sub             r3,  r3,  #96
        mov             lr,  r0
1:
        subs            r12, r12, #1
        vld1.16         {q8,  q9},  [r2]!
        vld1.16         {q0,  q1},  [r0, :128]!
        vld1.16         {q10, q11}, [r2]!
        vrhadd.u16      q0,  q0,  q8
        vld1.16         {q2,  q3},  [r0, :128]!
        vrhadd.u16      q1,  q1,  q9
        vld1.16         {q12, q13}, [r2]!
        vrhadd.u16      q2,  q2,  q10
        vst1.16         {q0,  q1},  [lr, :128]!
        vrhadd.u16      q3,  q3,  q11
        vld1.16         {q8,  q9},  [r0, :128]!
        vst1.16         {q2,  q3},  [lr, :128]!
        vrhadd.u16      q8,  q8,  q12
        vld1.16         {q14, q15}, [r2], r3
        vrhadd.u16      q9,  q9,  q13
        vld1.16         {q10, q11}, [r0, :128], r1
        vrhadd.u16      q10, q10, q14
        vst1.16         {q8,  q9},  [lr, :128]!
        vrhadd.u16      q11, q11, q15
        vst1.16         {q10, q11}, [lr, :128], r1
        bne             1b
        pop             {pc}                       @ return via the saved lr
endfunc

@ Average h rows of 32 unsigned 16 bit elements (64 bytes) from ref into
@ dst. Same pointer scheme as the 64 wide version: r0 reads dst, lr writes.
function ff_vp9_avg32_16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]              @ r12 = h
        @ The first 32 bytes of each row are covered by a fixed
        @ post-increment, the second transfer adds stride - 32.
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
        mov             lr,  r0
1:
        subs            r12, r12, #1
        vld1.16         {q8,  q9},  [r2]!
        vld1.16         {q0,  q1},  [r0, :128]!
        vld1.16         {q10, q11}, [r2], r3
        vrhadd.u16      q0,  q0,  q8
        vld1.16         {q2,  q3},  [r0, :128], r1
        vrhadd.u16      q1,  q1,  q9
        vrhadd.u16      q2,  q2,  q10
        vst1.16         {q0,  q1},  [lr, :128]!
        vrhadd.u16      q3,  q3,  q11
        vst1.16         {q2,  q3},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

@ Average h rows of 16 unsigned 16 bit elements (32 bytes) from ref into
@ dst. One row fits in a single transfer, so dst is read without
@ post-increment and the store advances r0 to the next row.
function ff_vp9_avg16_16_neon, export=1
        ldr             r12, [sp]                  @ r12 = h
1:
        subs            r12, r12, #1
        vld1.16         {q2,  q3},  [r2], r3
        vld1.16         {q0,  q1},  [r0, :128]
        vrhadd.u16      q0,  q0,  q2
        vrhadd.u16      q1,  q1,  q3
        vst1.16         {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

@ Average rows of 8 unsigned 16 bit elements (16 bytes) from ref into dst,
@ two rows per iteration. The counter is decremented by 2 and the loop
@ only ends when it reaches exactly zero, so h is assumed to be even.
function ff_vp9_avg8_16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]              @ r12 = h
        mov             lr,  r0                    @ lr = dst write pointer, r0 = dst read pointer
1:
        subs            r12, r12, #2
        vld1.16         {q2},  [r2], r3
        vld1.16         {q0},  [r0, :128], r1
        vld1.16         {q3},  [r2], r3
        vrhadd.u16      q0,  q0,  q2
        vld1.16         {q1},  [r0, :128], r1
        vrhadd.u16      q1,  q1,  q3
        vst1.16         {q0},  [lr, :128], r1
        vst1.16         {q1},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

@ Average rows of 4 unsigned 16 bit elements (8 bytes) from ref into dst,
@ two rows per iteration (h is assumed to be even, as above).
@ Only r0 is used for dst: it is advanced by one row while reading and
@ then rewound by one stride before the two stores.
function ff_vp9_avg4_16_neon, export=1
        ldr             r12, [sp]                  @ r12 = h
1:
        subs            r12, r12, #2
        vld1.16         {d2},  [r2], r3
        vld1.16         {d0},  [r0, :64], r1
        vld1.16         {d3},  [r2], r3
        vrhadd.u16      d0,  d0,  d2
        vld1.16         {d1},  [r0, :64]
        sub             r0,  r0,  r1               @ back to the first row of the pair
        vrhadd.u16      d1,  d1,  d3
        vst1.16         {d0},  [r0, :64], r1
        vst1.16         {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

@ Helper macros for vmull/vmlal with a constant from either d0 or d1 depending on index
@ (idx 0-3 selects lanes d0[0]-d0[3], idx 4-7 selects lanes d1[0]-d1[3]).
.macro vmull_lane dst, src, idx
.if \idx < 4
        vmull.s16       \dst, \src, d0[\idx]
.else
        vmull.s16       \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmlal_lane dst, src, idx
.if \idx < 4
        vmlal.s16       \dst, \src, d0[\idx]
.else
        vmlal.s16       \dst, \src, d1[\idx - 4]
.endif
.endm

@ Extract a vector from src1-src2 and src3-src4, and multiply-accumulate
@ into dst1 and dst3 (or dst1-dst2 and dst3-dst4 for size >= 8)
@ The extraction starts 2*offset bytes (offset 16 bit elements) into each
@ register pair, and the same offset selects the filter lane.
@ Clobbers q14 and q15.
.macro extmlal dst1, dst2, dst3, dst4, src1, src2, src3, src4, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src3, \src4, #(2*\offset)
        vmlal_lane      \dst1, d28, \offset
        vmlal_lane      \dst3, d30, \offset
.if \size >= 8
        vmlal_lane      \dst2, d29, \offset
        vmlal_lane      \dst4, d31, \offset
.endif
.endm


@ Instantiate a horizontal filter function for the given size.
@ This can work on 4 or 8 pixels in parallel; for larger
@ widths it will do 8 pixels at a time and loop horizontally.
@ The actual width (in bytes) is passed in r5, the height in r4 and
@ the filter coefficients in r12.
.macro do_8tap_h type, size
function \type\()_8tap_\size\()h
        @ Register use in this body:
        @   r0/r6  dst pointers for the first and second row of a pair
        @   r2/r7  src pointers for the first and second row of a pair
        @   r1/r3  dst/src stride, doubled since two rows are done per pass
        @   r4     remaining height, r5 width in bytes
        @   r12    filter pointer on entry, then the horizontal byte counter
        @   q0     the eight filter coefficients, q3 the max pixel value
        @ Reached by a plain branch from the ff_vp9_*_h_*_neon entry points,
        @ which push r4-r7; that is why this body ends with pop {r4-r7}.
        sub             r2,  r2,  #6               @ start 3 elements (6 bytes) left of the output position
        add             r6,  r0,  r1
        add             r7,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
        @ Only size >= 8 loops horizontally and needs
        @ reduced dst stride
.if \size >= 8
        sub             r1,  r1,  r5
.endif
        @ size >= 8 loads two qwords and increments r2,
        @ for size 4 it's enough with three dwords and no
        @ postincrement
        @ (32 bytes for the first load plus 16 per further horizontal
        @ step adds up to width + 16 bytes per row)
.if \size >= 8
        sub             r3,  r3,  r5
        sub             r3,  r3,  #16
.endif
        @ Load the filter vector
        vld1.16         {q0},  [r12,:128]
1:
.if \size >= 8
        mov             r12, r5                    @ bytes left to produce in this row pair
.endif
        @ Load src
.if \size >= 8
        vld1.16         {q8,  q9},  [r2]!
        vld1.16         {q10, q11}, [r7]!
.else
        vld1.16         {d16, d17, d18}, [r2]
        vld1.16         {d20, d21, d22}, [r7]
.endif
2:

        @ Tap 0 initialises the 32 bit accumulators: q1 (and q2) for the
        @ first row, q12 (and q13) for the second row. Taps 1-7 follow.
        vmull.s16       q1,  d16, d0[0]
        vmull.s16       q12, d20, d0[0]
.if \size >= 8
        vmull.s16       q2,  d17, d0[0]
        vmull.s16       q13, d21, d0[0]
.endif
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 1, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 2, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 3, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 4, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 5, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 6, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 7, \size

        @ Round, shift and saturate.
        @ The vqrshrun takes care of clamping negative values to zero, but
        @ we manually need to do vmin with the max pixel value.
        vqrshrun.s32    d2,  q1,  #7
        vqrshrun.s32    d24, q12, #7
.if \size >= 8
        vqrshrun.s32    d3,  q2,  #7
        vqrshrun.s32    d25, q13, #7
        vmin.u16        q1,  q1,  q3
        vmin.u16        q12, q12, q3
.else
        vmin.u16        d2,  d2,  d6
        vmin.u16        d24, d24, d6
.endif
        @ Average
.ifc \type,avg
.if \size >= 8
        vld1.16         {q14}, [r0,:128]
        vld1.16         {q15}, [r6,:128]
        vrhadd.u16      q1,  q1,  q14
        vrhadd.u16      q12, q12, q15
.else
        vld1.16         {d28}, [r0,:64]
        vld1.16         {d30}, [r6,:64]
        vrhadd.u16      d2,  d2,  d28
        vrhadd.u16      d24, d24, d30
.endif
.endif
        @ Store and loop horizontally (for size >= 8)
.if \size >= 8
        subs            r12, r12, #16
        vst1.16         {q1},  [r0,:128]!
        vst1.16         {q12}, [r6,:128]!
        beq             3f
        @ Slide the source window by 8 elements: the upper register of
        @ each pair becomes the lower one and a new upper one is loaded.
        vmov            q8,  q9
        vmov            q10, q11
        vld1.16         {q9},  [r2]!
        vld1.16         {q11}, [r7]!
        b               2b
.else @ \size == 4
        vst1.16         {d2},  [r0,:64]
        vst1.16         {d24}, [r6,:64]
.endif
3:
        @ Loop vertically
        add             r0,  r0,  r1
        add             r6,  r6,  r1
        add             r2,  r2,  r3
        add             r7,  r7,  r3
        subs            r4,  r4,  #2
        bne             1b
        pop             {r4-r7}
        bx              lr
endfunc
.endm

@ Instantiate both the put and the avg flavour of the horizontal body.
.macro do_8tap_h_size size
do_8tap_h put, \size
do_8tap_h avg, \size
.endm

do_8tap_h_size 4
do_8tap_h_size 8

@ Public entry point for one horizontal filter variant.
@ Saves r4-r7, loads h into r4 and mx into r5, sets every lane of q3 to
@ (1 << bpp) - 1, computes the filter address in r12 and the width in
@ bytes (2 * size) in r5, then branches to the shared body. All sizes
@ >= 8 share the 8h body, size 4 uses the 4h body.
.macro do_8tap_h_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
        push            {r4-r7}
        @ Four words were pushed, so the stack arguments moved up by 16.
        ldr             r4,  [sp, #16]
        ldr             r5,  [sp, #20]
        @ vmvn writes the complement of the immediate to every lane.
        vmvn.u16        q3,  #((0xffff << \bpp) & 0xffff)
        @ r6 is passed as the third movrelx operand (presumably a scratch
        @ register, see libavutil/arm/asm.S).
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        @ Filter address = table + 256 * offset + 16 * mx, where 16 bytes
        @ is one set of eight 16 bit coefficients.
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        mov             r5,  #2*\size
.if \size >= 8
        b               \type\()_8tap_8h
.else
        b               \type\()_8tap_4h
.endif
endfunc
.endm

@ Instantiate put and avg entry points for the three filter types; the
@ third argument is the filter set index used for the table offset.
.macro do_8tap_h_filters size, bpp
do_8tap_h_func put, regular, 1, \size, \bpp
do_8tap_h_func avg, regular, 1, \size, \bpp
do_8tap_h_func put, sharp,   2, \size, \bpp
do_8tap_h_func avg, sharp,   2, \size, \bpp
do_8tap_h_func put, smooth,  0, \size, \bpp
do_8tap_h_func avg, smooth,  0, \size, \bpp
.endm

@ Instantiate the horizontal entry points for all block sizes at one
@ bit depth.
.macro do_8tap_h_filters_bpp bpp
do_8tap_h_filters 64, \bpp
do_8tap_h_filters 32, \bpp
do_8tap_h_filters 16, \bpp
do_8tap_h_filters 8,  \bpp
do_8tap_h_filters 4,  \bpp
.endm

do_8tap_h_filters_bpp 10
do_8tap_h_filters_bpp 12

.ltorg

@ Vertical filters

@ Round, shift and saturate and store qreg1-4
@ Each qreg holds one row of four 32 bit sums and is narrowed into the
@ matching dreg. For avg, the four existing dst rows are read through r6
@ into tmp1-4 and averaged in. Rows are written through r0, and both
@ pointers advance by r1 per row. minreg holds the max pixel value.
.macro do_store4 qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, minreg, type
        vqrshrun.s32    \dreg1, \qreg1, #7
        vqrshrun.s32    \dreg2, \qreg2, #7
        vqrshrun.s32    \dreg3, \qreg3, #7
        vqrshrun.s32    \dreg4, \qreg4, #7
.ifc \type,avg
        vld1.16         {\tmp1}, [r6,:64], r1
        vld1.16         {\tmp2}, [r6,:64], r1
        vld1.16         {\tmp3}, [r6,:64], r1
        vld1.16         {\tmp4}, [r6,:64], r1
.endif
        vmin.u16        \dreg1, \dreg1, \minreg
        vmin.u16        \dreg2, \dreg2, \minreg
        vmin.u16        \dreg3, \dreg3, \minreg
        vmin.u16        \dreg4, \dreg4, \minreg
.ifc \type,avg
        vrhadd.u16      \dreg1, \dreg1, \tmp1
        vrhadd.u16      \dreg2, \dreg2, \tmp2
        vrhadd.u16      \dreg3, \dreg3, \tmp3
        vrhadd.u16      \dreg4, \dreg4, \tmp4
.endif
        vst1.16         {\dreg1}, [r0,:64], r1
        vst1.16         {\dreg2}, [r0,:64], r1
        vst1.16         {\dreg3}, [r0,:64], r1
        vst1.16         {\dreg4}, [r0,:64], r1
.endm

@ Round, shift and saturate and store qreg1-4
@ qreg1-2 belong to one line and qreg3-4 to the second line.
@ dreg1-2 == qreg1, dreg3-4 == qreg2.
@ The narrowing order matters: qreg2 is read into dreg2 before dreg3,
@ which overlaps qreg2, is written. For avg, qreg3 and qreg4 are free
@ after narrowing and are reused to hold the two dst rows read via r6.
.macro do_store8 qreg1, qreg2, qreg3, qreg4, dreg1, dreg2, dreg3, dreg4, minreg, type
        vqrshrun.s32    \dreg1, \qreg1, #7
        vqrshrun.s32    \dreg2, \qreg2, #7
        vqrshrun.s32    \dreg3, \qreg3, #7
        vqrshrun.s32    \dreg4, \qreg4, #7
.ifc \type,avg
        vld1.16         {\qreg3}, [r6,:128], r1
        vld1.16         {\qreg4}, [r6,:128], r1
.endif
        vmin.u16        \qreg1, \qreg1, \minreg
        vmin.u16        \qreg2, \qreg2, \minreg
.ifc \type,avg
        vrhadd.u16      \qreg1, \qreg1, \qreg3
        vrhadd.u16      \qreg2, \qreg2, \qreg4
.endif
        vst1.16         {\qreg1}, [r0,:128], r1
        vst1.16         {\qreg2}, [r0,:128], r1
.endm

@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2).
@ Even taps accumulate in dst1/dst2 and odd taps in tmp1/tmp2; the two
@ partial sums are added at the end. The tmp registers may alias early
@ sources (callers rely on this), so the instruction order must be kept.
.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
        vmull.s16       \dst1, \src1, d0[0]
        vmull.s16       \dst2, \src2, d0[0]
        vmull.s16       \tmp1, \src2, d0[1]
        vmull.s16       \tmp2, \src3, d0[1]
        vmlal.s16       \dst1, \src3, d0[2]
        vmlal.s16       \dst2, \src4, d0[2]
        vmlal.s16       \tmp1, \src4, d0[3]
        vmlal.s16       \tmp2, \src5, d0[3]
        vmlal.s16       \dst1, \src5, d1[0]
        vmlal.s16       \dst2, \src6, d1[0]
        vmlal.s16       \tmp1, \src6, d1[1]
        vmlal.s16       \tmp2, \src7, d1[1]
        vmlal.s16       \dst1, \src7, d1[2]
        vmlal.s16       \dst2, \src8, d1[2]
        vmlal.s16       \tmp1, \src8, d1[3]
        vmlal.s16       \tmp2, \src9, d1[3]
        vadd.s32        \dst1, \dst1, \tmp1
        vadd.s32        \dst2, \dst2, \tmp2
.endm

@ Evaluate the filter twice in parallel. This does the same as convolve4 above,
@ but with double width (two input/output registers per row).
.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15, src16, src17, src18
        @ Each input row occupies two consecutive src arguments (low and
        @ high half). dst1/dst2 take the rows starting at src1/src2 and
        @ dst3/dst4 the rows starting at src3/src4, one row further down.
        @ Callers pass destinations that alias the first sources, so the
        @ instruction order must be kept.
        vmull.s16       \dst1, \src1,  d0[0]
        vmull.s16       \dst2, \src2,  d0[0]
        vmull.s16       \dst3, \src3,  d0[0]
        vmull.s16       \dst4, \src4,  d0[0]
        vmlal.s16       \dst1, \src3,  d0[1]
        vmlal.s16       \dst2, \src4,  d0[1]
        vmlal.s16       \dst3, \src5,  d0[1]
        vmlal.s16       \dst4, \src6,  d0[1]
        vmlal.s16       \dst1, \src5,  d0[2]
        vmlal.s16       \dst2, \src6,  d0[2]
        vmlal.s16       \dst3, \src7,  d0[2]
        vmlal.s16       \dst4, \src8,  d0[2]
        vmlal.s16       \dst1, \src7,  d0[3]
        vmlal.s16       \dst2, \src8,  d0[3]
        vmlal.s16       \dst3, \src9,  d0[3]
        vmlal.s16       \dst4, \src10, d0[3]
        vmlal.s16       \dst1, \src9,  d1[0]
        vmlal.s16       \dst2, \src10, d1[0]
        vmlal.s16       \dst3, \src11, d1[0]
        vmlal.s16       \dst4, \src12, d1[0]
        vmlal.s16       \dst1, \src11, d1[1]
        vmlal.s16       \dst2, \src12, d1[1]
        vmlal.s16       \dst3, \src13, d1[1]
        vmlal.s16       \dst4, \src14, d1[1]
        vmlal.s16       \dst1, \src13, d1[2]
        vmlal.s16       \dst2, \src14, d1[2]
        vmlal.s16       \dst3, \src15, d1[2]
        vmlal.s16       \dst4, \src16, d1[2]
        vmlal.s16       \dst1, \src15, d1[3]
        vmlal.s16       \dst2, \src16, d1[3]
        vmlal.s16       \dst3, \src17, d1[3]
        vmlal.s16       \dst4, \src18, d1[3]
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12.
@ The width in r5 is counted in pixels here (8 are consumed per column
@ pass) and q1 holds the max pixel value. The body is reached by a plain
@ branch from the ff_vp9_*_v_*_neon entry points, which push r4-r6 and
@ q4-q7; that is why it ends with the matching vpop/pop.
.macro do_8tap_8v type
function \type\()_8tap_8v
        @ Start three rows above the first output row.
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
1:
.ifc \type,avg
        mov             r6,  r0                    @ r6 = dst read pointer used by do_store8
.endif
        mov             r12, r4                    @ rows left in this 8 pixel wide column

        @ Preload seven rows; every stage below adds four more and emits
        @ four output rows.
        vld1.16         {q5},  [r2], r3
        vld1.16         {q6},  [r2], r3
        vld1.16         {q7},  [r2], r3
        vld1.16         {q8},  [r2], r3
        vld1.16         {q9},  [r2], r3
        vld1.16         {q10}, [r2], r3
        vld1.16         {q11}, [r2], r3
2:
        @ Stage 1: window q5-q11 plus the new rows q12-q15.
        vld1.16         {q12}, [r2], r3
        vld1.16         {q13}, [r2], r3
        vld1.16         {q14}, [r2], r3
        vld1.16         {q15}, [r2], r3
        convolve8       q2,  q3,  q4,  q5,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27
        do_store8       q2,  q3,  q4,  q5,  d4,  d5,  d6,  d7,  q1,  \type
        convolve8       q2,  q3,  q4,  q5,  d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        do_store8       q2,  q3,  q4,  q5,  d4,  d5,  d6,  d7,  q1,  \type

        subs            r12, r12, #4
        beq             8f

        @ Stage 2: window q9-q15 plus the new rows q4-q7.
        vld1.16         {q4},  [r2], r3
        vld1.16         {q5},  [r2], r3
        vld1.16         {q6},  [r2], r3
        vld1.16         {q7},  [r2], r3
        convolve8       q2,  q3,  q8,  q9,  d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11
        do_store8       q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  q1,  \type
        convolve8       q2,  q3,  q8,  q9,  d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15
        do_store8       q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  q1,  \type

        subs            r12, r12, #4
        beq             8f

        @ Stage 3: window q13-q15, q4-q7 plus the new rows q8-q11. After
        @ this stage q5-q11 again hold the last seven rows, matching the
        @ state expected at label 2.
        vld1.16         {q8},  [r2], r3
        vld1.16         {q9},  [r2], r3
        vld1.16         {q10}, [r2], r3
        vld1.16         {q11}, [r2], r3
        convolve8       q2,  q3,  q12, q13, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19
        do_store8       q2,  q3,  q12, q13, d4,  d5,  d6,  d7,  q1,  \type
        convolve8       q2,  q3,  q12, q13, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23
        do_store8       q2,  q3,  q12, q13, d4,  d5,  d6,  d7,  q1,  \type

        subs            r12, r12, #4
        bne             2b

8:
        subs            r5,  r5,  #8
        beq             9f
        @ Rewind both pointers to the top of the block and step 16 bytes
        @ (8 pixels) to the right. The net source rewind is h + 7 rows,
        @ which is the number of rows read for one column.
        @ r0 -= h * dst_stride
        mls             r0,  r1,  r4,  r0
        @ r2 -= h * src_stride
        mls             r2,  r3,  r4,  r2
        @ r2 -= 8 * src_stride
        sub             r2,  r2,  r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2,  r2,  r3
        add             r2,  r2,  #16
        add             r0,  r0,  #16
        b               1b
9:
        vpop            {q4-q7}
        pop             {r4-r6}
        bx              lr
endfunc
.endm

do_8tap_8v put
do_8tap_8v avg

@ Instantiate a vertical filter function for filtering a 4 pixels wide
@ slice. This only is designed to work for 4 or 8 output lines.
@ The height is read from r4, the filter pointer from r12 and the max
@ pixel value from d2 (the low half of q1). Nothing is vpushed for this
@ size, so only r4-r6 are restored at the end.
.macro do_8tap_4v type
function \type\()_8tap_4v
        @ Start three rows above the first output row.
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
.ifc \type,avg
        mov             r6,  r0                    @ r6 = dst read pointer used by do_store4
.endif

        @ Eleven input rows are needed for the first four output rows.
        vld1.16         {d16}, [r2], r3
        vld1.16         {d17}, [r2], r3
        vld1.16         {d18}, [r2], r3
        vld1.16         {d19}, [r2], r3
        vld1.16         {d20}, [r2], r3
        vld1.16         {d21}, [r2], r3
        vld1.16         {d22}, [r2], r3
        vld1.16         {d23}, [r2], r3
        vld1.16         {d24}, [r2], r3
        vld1.16         {d25}, [r2], r3
        vld1.16         {d26}, [r2], r3
        @ The second convolve4 reuses q8/q9 as temporaries although d18
        @ and d19 are also its first sources; this relies on the
        @ instruction order inside convolve4.
        convolve4       q2,  q3,  d16, d17, d18, d19, d20, d21, d22, d23, d24, q14, q15
        convolve4       q14, q15, d18, d19, d20, d21, d22, d23, d24, d25, d26, q8,  q9
        do_store4       q2,  d4,  q3,  d6,  q14, d28, q15, d30, d5,  d7,  d29, d31, d2,  \type

        subs            r4,  r4,  #4
        beq             9f

        @ Four more input rows give the second group of four output rows.
        vld1.16         {d27}, [r2], r3
        vld1.16         {d28}, [r2], r3
        vld1.16         {d29}, [r2], r3
        vld1.16         {d30}, [r2], r3
        convolve4       q2,  q3,  d20, d21, d22, d23, d24, d25, d26, d27, d28, q8,  q9
        convolve4       q8,  q9,  d22, d23, d24, d25, d26, d27, d28, d29, d30, q10, q11
        do_store4       q2,  d4,  q3,  d6,  q8,  d16, q9,  d18, d5,  d7,  d17, d19, d2,  \type

9:
        pop             {r4-r6}
        bx              lr
endfunc
.endm

do_8tap_4v put
do_8tap_4v avg

@ Public entry point for one vertical filter variant.
@ Saves r4-r6 (and q4-q7 for size >= 8), loads h into r4 and my into r5,
@ sets every lane of q1 to (1 << bpp) - 1, computes the filter address in
@ r12 and the width in pixels in r5, then branches to the shared body.
@ All sizes >= 8 share the 8v body, size 4 uses the 4v body.
.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
        push            {r4-r6}
        @ Three words were pushed, so h is at [sp, #12] and my at
        @ [sp, #20]. Both are read before the vpush moves sp again.
        ldr             r4,  [sp, #12]
        ldr             r5,  [sp, #20]
.if \size >= 8
        @ The 8 pixel wide body uses q4-q7 and restores them on exit.
        vpush           {q4-q7}
.endif
        @ vmvn writes the complement of the immediate to every lane.
        vmvn.u16        q1,  #((0xffff << \bpp) & 0xffff)
        @ r6 is passed as the third movrelx operand (presumably a scratch
        @ register, see libavutil/arm/asm.S).
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        @ Filter address = table + 256 * offset + 16 * my.
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        mov             r5,  #\size
.if \size >= 8
        b               \type\()_8tap_8v
.else
        b               \type\()_8tap_4v
.endif
endfunc
.endm

@ Instantiate put and avg entry points for the three filter types; the
@ third argument is the filter set index used for the table offset.
.macro do_8tap_v_filters size, bpp
do_8tap_v_func put, regular, 1, \size, \bpp
do_8tap_v_func avg, regular, 1, \size, \bpp
do_8tap_v_func put, sharp,   2, \size, \bpp
do_8tap_v_func avg, sharp,   2, \size, \bpp
do_8tap_v_func put, smooth,  0, \size, \bpp
do_8tap_v_func avg, smooth,  0, \size, \bpp
.endm

@ Instantiate the vertical entry points for all block sizes at one bit
@ depth.
.macro do_8tap_v_filters_bpp bpp
do_8tap_v_filters 64, \bpp
do_8tap_v_filters 32, \bpp
do_8tap_v_filters 16, \bpp
do_8tap_v_filters 8,  \bpp
do_8tap_v_filters 4,  \bpp
.endm

do_8tap_v_filters_bpp 10
do_8tap_v_filters_bpp 12