/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                            const uint8_t *ref, ptrdiff_t ref_stride,
//                            int h, int mx, int my);
//
// With the AArch64 procedure call standard the arguments arrive as
//   x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride,
//   w4 = h,   w5 = mx,         w6 = my
// Only v0-v7 and v16-v31 are used as vector scratch in this file, so the
// callee-saved v8-v15 never need to be preserved.
//
// All row loops below count w4 down with "subs" and leave with "b.ne"; the
// NEON instructions in between do not touch the flags. The loops exit only
// when the counter hits exactly zero, so h is expected to be a positive
// multiple of the number of rows handled per iteration.

// 64 pixels wide average: dst = (dst + ref + 1) >> 1 per byte (urhadd),
// two rows per iteration. x0 is the read pointer into dst and x5 (a copy
// of the original dst) the write pointer, so the second row can be loaded
// before the first one is stored.
function ff_vp9_avg64_neon, export=1
        mov             x5,  x0
1:
        ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
        urhadd          v0.16b,  v0.16b,  v4.16b
        urhadd          v1.16b,  v1.16b,  v5.16b
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        urhadd          v2.16b,  v2.16b,  v6.16b
        urhadd          v3.16b,  v3.16b,  v7.16b
        subs            w4,  w4,  #2
        urhadd          v16.16b, v16.16b, v20.16b
        urhadd          v17.16b, v17.16b, v21.16b
        st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
        urhadd          v18.16b, v18.16b, v22.16b
        urhadd          v19.16b, v19.16b, v23.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
        b.ne            1b
        ret
endfunc

// 32 pixels wide average, one row per iteration. dst is read without
// post-increment and written back with post-increment, so a single
// pointer (x0) is enough.
function ff_vp9_avg32_neon, export=1
1:
        ld1             {v2.16b, v3.16b},  [x2], x3
        ld1             {v0.16b, v1.16b},  [x0]
        urhadd          v0.16b,  v0.16b,  v2.16b
        urhadd          v1.16b,  v1.16b,  v3.16b
        subs            w4,  w4,  #1
        st1             {v0.16b, v1.16b},  [x0], x1
        b.ne            1b
        ret
endfunc

// 16 pixels wide copy, four rows per iteration. x0/x2 address the even
// rows and x5/x6 the odd rows; both strides are doubled so that each
// pointer skips the rows handled by the other one.
function ff_vp9_copy16_neon, export=1
        add             x5,  x0,  x1
        lsl             x1,  x1,  #1
        add             x6,  x2,  x3
        lsl             x3,  x3,  #1
1:
        ld1             {v0.16b},  [x2], x3
        ld1             {v1.16b},  [x6], x3
        ld1             {v2.16b},  [x2], x3
        ld1             {v3.16b},  [x6], x3
        subs            w4,  w4,  #4
        st1             {v0.16b},  [x0], x1
        st1             {v1.16b},  [x5], x1
        st1             {v2.16b},  [x0], x1
        st1             {v3.16b},  [x5], x1
        b.ne            1b
        ret
endfunc

// 16 pixels wide average, two rows per iteration.
// x0 = dst read pointer, x5 = dst write pointer.
function ff_vp9_avg16_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.16b},  [x2], x3
        ld1             {v0.16b},  [x0], x1
        ld1             {v3.16b},  [x2], x3
        urhadd          v0.16b,  v0.16b,  v2.16b
        ld1             {v1.16b},  [x0], x1
        urhadd          v1.16b,  v1.16b,  v3.16b
        subs            w4,  w4,  #2
        st1             {v0.16b},  [x5], x1
        st1             {v1.16b},  [x5], x1
        b.ne            1b
        ret
endfunc

// 8 pixels wide copy, two rows per iteration.
function ff_vp9_copy8_neon, export=1
1:
        ld1             {v0.8b},  [x2], x3
        ld1             {v1.8b},  [x2], x3
        subs            w4,  w4,  #2
        st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x0], x1
        b.ne            1b
        ret
endfunc

// 8 pixels wide average, two rows per iteration.
// x0 = dst read pointer, x5 = dst write pointer.
function ff_vp9_avg8_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.8b},  [x2], x3
        ld1             {v0.8b},  [x0], x1
        ld1             {v3.8b},  [x2], x3
        urhadd          v0.8b,  v0.8b,  v2.8b
        ld1             {v1.8b},  [x0], x1
        urhadd          v1.8b,  v1.8b,  v3.8b
        subs            w4,  w4,  #2
        st1             {v0.8b},  [x5], x1
        st1             {v1.8b},  [x5], x1
        b.ne            1b
        ret
endfunc

// 4 pixels wide copy, four rows per iteration, one 32 bit lane per row.
function ff_vp9_copy4_neon, export=1
1:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        st1             {v0.s}[0], [x0], x1
        ld1             {v2.s}[0], [x2], x3
        st1             {v1.s}[0], [x0], x1
        ld1             {v3.s}[0], [x2], x3
        subs            w4,  w4,  #4
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        b.ne            1b
        ret
endfunc

// 4 pixels wide average, four rows per iteration. Two rows are packed
// into the two 32 bit lanes of each 64 bit register, so two urhadd
// instructions cover all four rows.
// x0 = dst read pointer, x5 = dst write pointer.
function ff_vp9_avg4_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.s}[0], [x2], x3
        ld1             {v0.s}[0], [x0], x1
        ld1             {v2.s}[1], [x2], x3
        ld1             {v0.s}[1], [x0], x1
        ld1             {v3.s}[0], [x2], x3
        ld1             {v1.s}[0], [x0], x1
        ld1             {v3.s}[1], [x2], x3
        ld1             {v1.s}[1], [x0], x1
        subs            w4,  w4,  #4
        urhadd          v0.8b,  v0.8b,  v2.8b
        urhadd          v1.8b,  v1.8b,  v3.8b
        st1             {v0.s}[0], [x5], x1
        st1             {v0.s}[1], [x5], x1
        st1             {v1.s}[0], [x5], x1
        st1             {v1.s}[1], [x5], x1
        b.ne            1b
        ret
endfunc


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
// dst1-dst2 and dst3-dst4 for size >= 16)
// offset counts 16 bit elements: it is doubled to form the byte offset of
// the ext and also selects the coefficient lane v0.h[offset].
// Clobbers v20 and v22 (plus v21 and v23 for size >= 16).
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mla             \dst1\().8h, v20.8h, v0.h[\offset]
        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mla             \dst3\().8h, v22.8h, v0.h[\offset]
        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mla             \dst2\().8h, v21.8h, v0.h[\offset]
        mla             \dst4\().8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mla             \dst1\().8h, v20.8h, v0.h[\offset]
        mla             \dst3\().8h, v22.8h, v0.h[\offset]
.else
        mla             \dst1\().4h, v20.4h, v0.h[\offset]
        mla             \dst3\().4h, v22.4h, v0.h[\offset]
.endif
.endm
// The same as above, but don't accumulate straight into the
// destination, but use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        // Arguments:
        //   dst1-dst4  accumulators (dst2/dst4 only touched for size >= 16)
        //   src1-src3  consecutive 8x16 bit vectors of the first row
        //   src4-src6  consecutive 8x16 bit vectors of the second row
        //   offset     tap index; selects v0.h[offset] and, doubled, the
        //              byte offset of the ext
        //   size       4, 8 or >= 16 pixels per row
        // The products are formed in v20/v22 (and v21/v23 for size >= 16),
        // which are clobbered, and then added with sqadd so that the final
        // sum saturates instead of wrapping around.
        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mul             v20.8h, v20.8h, v0.h[\offset]
        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mul             v22.8h, v22.8h, v0.h[\offset]
        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mul             v21.8h, v21.8h, v0.h[\offset]
        mul             v23.8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mul             v20.8h, v20.8h, v0.h[\offset]
        mul             v22.8h, v22.8h, v0.h[\offset]
.else
        mul             v20.4h, v20.4h, v0.h[\offset]
        mul             v22.4h, v22.4h, v0.h[\offset]
.endif
.if \size == 4
        sqadd           \dst1\().4h, \dst1\().4h, v20.4h
        sqadd           \dst3\().4h, \dst3\().4h, v22.4h
.else
        sqadd           \dst1\().8h, \dst1\().8h, v20.8h
        sqadd           \dst3\().8h, \dst3\().8h, v22.8h
.if \size >= 16
        sqadd           \dst2\().8h, \dst2\().8h, v21.8h
        sqadd           \dst4\().8h, \dst4\().8h, v23.8h
.endif
.endif
.endm


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width is passed in x5, the height in w4 and the
// filter coefficients in x9. idx2 is the index of the largest
// filter coefficient (3 or 4) and idx1 is the other one of them.
//
// Two rows are filtered per iteration: x0/x2 are dst/src of the first
// row and x6/x7 dst/src of the second one. x1 and x3 are turned into the
// distance from the end of one processed row to the start of the row two
// lines further down. For size >= 16, x9 is reused as the horizontal
// pixel counter once the coefficients have been loaded into v0.
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        // The 8 tap window starts 3 pixels to the left of the output pixel
        sub             x2,  x2,  #3
        add             x6,  x0,  x1
        add             x7,  x2,  x3
        add             x1,  x1,  x1
        add             x3,  x3,  x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub             x1,  x1,  x5
.endif
        // size >= 16 loads two qwords and increments x2,
        // for size 4/8 it's enough with one qword and no
        // postincrement
.if \size >= 16
        // Per row 24 bytes are read up front and 16 more for every
        // further 16 pixel step, i.e. width + 8 bytes in total
        sub             x3,  x3,  x5
        sub             x3,  x3,  #8
.endif
        // Load the filter vector
        ld1             {v0.8h},  [x9]
1:
.if \size >= 16
        mov             x9,  x5
.endif
        // Load src
.if \size >= 16
        ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
        ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
.else
        ld1             {v4.8b,  v5.8b},  [x2]
        ld1             {v16.8b, v17.8b}, [x7]
.endif
        uxtl            v4.8h,  v4.8b
        uxtl            v5.8h,  v5.8b
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
.if \size >= 16
        uxtl            v6.8h,  v6.8b
        uxtl            v18.8h, v18.8b
.endif
2:

        // Accumulate, adding idx2 last with a separate
        // saturating add. The positive filter coefficients
        // for all indices except idx2 must add up to less
        // than 127 for this not to overflow.
        mul             v1.8h,  v4.8h,  v0.h[0]
        mul             v24.8h, v16.8h, v0.h[0]
.if \size >= 16
        mul             v2.8h,  v5.8h,  v0.h[0]
        mul             v25.8h, v17.8h, v0.h[0]
.endif
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 1,     \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 2,     \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, \idx1, \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 5,     \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 6,     \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 7,     \size
        extmulqadd      v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, \idx2, \size

        // Round, shift and saturate
        sqrshrun        v1.8b,   v1.8h,  #7
        sqrshrun        v24.8b,  v24.8h, #7
.if \size >= 16
        sqrshrun2       v1.16b,  v2.8h,  #7
        sqrshrun2       v24.16b, v25.8h, #7
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1             {v2.16b}, [x0]
        ld1             {v3.16b}, [x6]
        urhadd          v1.16b,  v1.16b,  v2.16b
        urhadd          v24.16b, v24.16b, v3.16b
.elseif \size == 8
        ld1             {v2.8b},  [x0]
        ld1             {v3.8b},  [x6]
        urhadd          v1.8b,  v1.8b,  v2.8b
        urhadd          v24.8b, v24.8b, v3.8b
.else
        ld1             {v2.s}[0], [x0]
        ld1             {v3.s}[0], [x6]
        urhadd          v1.8b,  v1.8b,  v2.8b
        urhadd          v24.8b, v24.8b, v3.8b
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            x9,  x9,  #16
        st1             {v1.16b},  [x0], #16
        st1             {v24.16b}, [x6], #16
        b.eq            3f
        // Slide the window: the last 8 widened pixels become the first
        // ones, and 16 new pixels per row are loaded and widened
        mov             v4.16b,  v6.16b
        mov             v16.16b, v18.16b
        ld1             {v6.16b},  [x2], #16
        ld1             {v18.16b}, [x7], #16
        uxtl            v5.8h,  v6.8b
        uxtl2           v6.8h,  v6.16b
        uxtl            v17.8h, v18.8b
        uxtl2           v18.8h, v18.16b
        b               2b
.elseif \size == 8
        st1             {v1.8b},    [x0]
        st1             {v24.8b},   [x6]
.else // \size == 4
        st1             {v1.s}[0],  [x0]
        st1             {v24.s}[0], [x6]
.endif
3:
        // Loop vertically
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x2,  x2,  x3
        add             x7,  x7,  x3
        subs            w4,  w4,  #2
        b.ne            1b
        ret
endfunc
.endm

// Instantiate the put and avg variants of the horizontal filter for one
// size, for both positions (index 4 or index 3) of the largest coefficient.
.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

// Public entry point for one horizontal filter type and block width.
// w5 holds mx on entry. x9 is pointed at the 16 byte coefficient row
// number mx of the table selected by offset, then x5 is replaced with
// the block width expected by the inner function.
// NOTE(review): 256*\offset suggests 16 rows of 16 bytes per filter type;
// confirm against the definition of ff_vp9_subpel_filters.
.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
        // The flags set here are consumed by the b.ge below; the add and
        // mov in between do not modify them. mx >= 8 selects the variant
        // with idx2 == 4.
        cmp             w5,  #8
        add             x9,  x6,  w5, uxtw #4
        mov             x5,  #\size
.if \size >= 16
        // Widths above 16 share the 16 pixel function, which loops
        // horizontally over the width in x5
        b.ge            \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        b.ge            \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

// Instantiate the put and avg entry points of all three filter types
// for one block width.
.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp,   2, \size
do_8tap_h_func avg, sharp,   2, \size
do_8tap_h_func put, smooth,  0, \size
do_8tap_h_func avg, smooth,  0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4


// Vertical filters

// Round, shift and saturate and store reg1-reg2 over 4 lines
// The rows are interleaved: reg1 holds rows 0 and 2, reg2 rows 1 and 3,
// each in one 32 bit lane. For type avg the existing dst pixels are read
// through x7 into tmp1-tmp2 and averaged in before the store through x0.
.macro do_store4 reg1, reg2, tmp1, tmp2, type
        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
.ifc \type,avg
        ld1             {\tmp1\().s}[0],  [x7], x1
        ld1             {\tmp2\().s}[0],  [x7], x1
        ld1             {\tmp1\().s}[1],  [x7], x1
        ld1             {\tmp2\().s}[1],  [x7], x1
        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
.endif
        st1             {\reg1\().s}[0],  [x0], x1
        st1             {\reg2\().s}[0],  [x0], x1
        st1             {\reg1\().s}[1],  [x0], x1
        st1             {\reg2\().s}[1],  [x0], x1
.endm

// Round, shift and saturate and store reg1-4
// One 8 pixel row per register. For type avg the existing dst pixels are
// read through x7 into tmp1-tmp4 and averaged in before the store
// through x0.
.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
        sqrshrun        \reg3\().8b,  \reg3\().8h, #7
        sqrshrun        \reg4\().8b,  \reg4\().8h, #7
.ifc \type,avg
        ld1             {\tmp1\().8b},  [x7], x1
        ld1             {\tmp2\().8b},  [x7], x1
        ld1             {\tmp3\().8b},  [x7], x1
        ld1             {\tmp4\().8b},  [x7], x1
        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
        urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
        urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
.endif
        st1             {\reg1\().8b},  [x0], x1
        st1             {\reg2\().8b},  [x0], x1
        st1             {\reg3\().8b},  [x0], x1
        st1             {\reg4\().8b},  [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
// at the end with saturation. Indices 0 and 7 always have negative or zero
// coefficients, so they can be accumulated into tmp1-tmp2 together with the
// largest coefficient.
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        // dst1/dst2 collect taps 1, 2, idx1, 5 and 6; tmp1/tmp2 collect
        // taps 0, 7 and idx2. The two partial sums are joined with a
        // saturating add at the end. Coefficients are read from v0.h[0-7].
        mul             \dst1\().8h, \src2\().8h, v0.h[1]
        mul             \dst2\().8h, \src3\().8h, v0.h[1]
        mul             \tmp1\().8h, \src1\().8h, v0.h[0]
        mul             \tmp2\().8h, \src2\().8h, v0.h[0]
        mla             \dst1\().8h, \src3\().8h, v0.h[2]
        mla             \dst2\().8h, \src4\().8h, v0.h[2]
.if \idx1 == 3
        mla             \dst1\().8h, \src4\().8h, v0.h[3]
        mla             \dst2\().8h, \src5\().8h, v0.h[3]
.else
        mla             \dst1\().8h, \src5\().8h, v0.h[4]
        mla             \dst2\().8h, \src6\().8h, v0.h[4]
.endif
        mla             \dst1\().8h, \src6\().8h, v0.h[5]
        mla             \dst2\().8h, \src7\().8h, v0.h[5]
        mla             \tmp1\().8h, \src8\().8h, v0.h[7]
        mla             \tmp2\().8h, \src9\().8h, v0.h[7]
        mla             \dst1\().8h, \src7\().8h, v0.h[6]
        mla             \dst2\().8h, \src8\().8h, v0.h[6]
.if \idx2 == 3
        mla             \tmp1\().8h, \src4\().8h, v0.h[3]
        mla             \tmp2\().8h, \src5\().8h, v0.h[3]
.else
        mla             \tmp1\().8h, \src5\().8h, v0.h[4]
        mla             \tmp2\().8h, \src6\().8h, v0.h[4]
.endif
        sqadd           \dst1\().8h, \dst1\().8h, \tmp1\().8h
        sqadd           \dst2\().8h, \dst2\().8h, \tmp2\().8h
.endm

// Load pixels and extend them to 16 bit
// Reads three rows of 8 pixels (four if dst4 is given) from x2, advancing
// it by x3 per row. Clobbers v1-v3 (and v4 if dst4 is given).
.macro loadl dst1, dst2, dst3, dst4
        ld1             {v1.8b}, [x2], x3
        ld1             {v2.8b}, [x2], x3
        ld1             {v3.8b}, [x2], x3
.ifnb \dst4
        ld1             {v4.8b}, [x2], x3
.endif
        uxtl            \dst1\().8h, v1.8b
        uxtl            \dst2\().8h, v2.8b
        uxtl            \dst3\().8h, v3.8b
.ifnb \dst4
        uxtl            \dst4\().8h, v4.8b
.endif
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
// and idx1 is the other one of them.
//
// Once the coefficients are in v0, x6 is reused as the row counter of the
// current 8 pixel wide column. For type avg, x7 is the read pointer into
// dst used by do_store. The row loop is unrolled three times so that the
// widened source rows rotate through v16-v27 without register moves.
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        // The filter window starts 3 rows above the output row
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        ld1             {v0.8h},  [x6]
1:
.ifc \type,avg
        mov             x7,  x0
.endif
        mov             x6,  x4

        loadl           v17, v18, v19

        loadl           v20, v21, v22, v23
2:
        loadl           v24, v25, v26, v27
        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5,  v6
        convolve        v3,  v4,  v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5,  v6
        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type

        subs            x6,  x6,  #4
        b.eq            8f

        loadl           v16, v17, v18, v19
        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5,  v6
        convolve        v3,  v4,  v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5,  v6
        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type

        subs            x6,  x6,  #4
        b.eq            8f

        loadl           v20, v21, v22, v23
        convolve        v1,  v2,  v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5,  v6
        convolve        v3,  v4,  v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5,  v6
        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type

        subs            x6,  x6,  #4
        b.ne            2b

8:
        // Move on to the next 8 pixel wide column, if any. h + 7 source
        // rows were read for this column (7 up front plus 4 per group of
        // 4 output rows), which is what the three adjustments of x2
        // below rewind.
        subs            x5,  x5,  #8
        b.eq            9f
        // x0 -= h * dst_stride
        msub            x0,  x1,  x4, x0
        // x2 -= h * src_stride
        msub            x2,  x3,  x4, x2
        // x2 -= 8 * src_stride
        sub             x2,  x2,  x3, lsl #3
        // x2 += 1 * src_stride
        add             x2,  x2,  x3
        add             x2,  x2,  #8
        add             x0,  x0,  #8
        b               1b
9:
        ret
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3
// Instantiate a vertical filter function for filtering a 4 pixels wide
// slice. The first half of the registers contain one row, while the second
// half of a register contains the second-next row (also stored in the first
// half of the register two steps ahead). The convolution does two outputs
// at a time; the output of v17-v24 into one, and v18-v25 into another one.
// The first half of first output is the first output row, the first half
// of the other output is the second output row. The second halves of the
// registers are rows 3 and 4.
// This only is designed to work for 4 or 8 output lines.
//
// On entry x0/x1 are dst and its stride, x2/x3 src and its stride, x4 the
// height and x6 the pointer to the filter coefficients. For type avg, x7
// is the read pointer into dst used by do_store4.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        // The filter window starts 3 rows above the output row
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        ld1             {v0.8h},  [x6]
.ifc \type,avg
        mov             x7,  x0
.endif

        // Load 11 source rows, pair row n with row n+2 using trn1 and
        // widen the pairs to 16 bit into v17-v25
        ld1             {v1.s}[0],  [x2], x3
        ld1             {v2.s}[0],  [x2], x3
        ld1             {v3.s}[0],  [x2], x3
        ld1             {v4.s}[0],  [x2], x3
        ld1             {v5.s}[0],  [x2], x3
        ld1             {v6.s}[0],  [x2], x3
        trn1            v1.2s,  v1.2s,  v3.2s
        ld1             {v7.s}[0],  [x2], x3
        trn1            v2.2s,  v2.2s,  v4.2s
        ld1             {v26.s}[0], [x2], x3
        uxtl            v17.8h, v1.8b
        trn1            v3.2s,  v3.2s,  v5.2s
        ld1             {v27.s}[0], [x2], x3
        uxtl            v18.8h, v2.8b
        trn1            v4.2s,  v4.2s,  v6.2s
        ld1             {v28.s}[0], [x2], x3
        uxtl            v19.8h, v3.8b
        trn1            v5.2s,  v5.2s,  v7.2s
        ld1             {v29.s}[0], [x2], x3
        uxtl            v20.8h, v4.8b
        trn1            v6.2s,  v6.2s,  v26.2s
        uxtl            v21.8h, v5.8b
        trn1            v7.2s,  v7.2s,  v27.2s
        uxtl            v22.8h, v6.8b
        trn1            v26.2s, v26.2s, v28.2s
        uxtl            v23.8h, v7.8b
        trn1            v27.2s, v27.2s, v29.2s
        uxtl            v24.8h, v26.8b
        uxtl            v25.8h, v27.8b

        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3,  v4
        do_store4       v1,  v2,  v5,  v6,  \type

        subs            x4,  x4,  #4
        b.eq            9f

        // Second group of four output rows: load four more source rows,
        // complete the pairs started in v28/v29 and build the final two
        ld1             {v1.s}[0],  [x2], x3
        ld1             {v2.s}[0],  [x2], x3
        trn1            v28.2s, v28.2s, v1.2s
        trn1            v29.2s, v29.2s, v2.2s
        ld1             {v1.s}[1],  [x2], x3
        uxtl            v26.8h, v28.8b
        ld1             {v2.s}[1],  [x2], x3
        uxtl            v27.8h, v29.8b
        uxtl            v28.8h, v1.8b
        uxtl            v29.8h, v2.8b

        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3,  v4
        do_store4       v1,  v2,  v5,  v6,  \type

9:
        ret
endfunc
.endm
do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3


// Public entry point for one vertical filter type and block width.
// w6 holds my on entry. x6 is pointed at the 16 byte coefficient row
// number my of the table selected by offset, and x5 is set to the block
// width expected by the inner function.
// NOTE(review): 256*\offset suggests 16 rows of 16 bytes per filter type;
// confirm against the definition of ff_vp9_subpel_filters.
.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        // h is an int; the 8 pixel wide inner function uses it as a
        // 64 bit value (mov/msub on x4), so zero extend it first
        uxtw            x4,  w4
        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
        // The flags set here are consumed by the b.ge below; the add and
        // mov in between do not modify them. my >= 8 selects the variant
        // with idx2 == 4.
        cmp             w6,  #8
        add             x6,  x5,  w6, uxtw #4
        mov             x5,  #\size
.if \size >= 8
        // Widths above 8 share the 8 pixel function, which walks the
        // block in 8 pixel wide columns using the width in x5
        b.ge            \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        b.ge            \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm

// Instantiate the put and avg entry points of all three filter types
// for one block width.
.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp,   2, \size
do_8tap_v_func avg, sharp,   2, \size
do_8tap_v_func put, smooth,  0, \size
do_8tap_v_func avg, smooth,  0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4