/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ Transpose an 8x8 matrix of 16 bit elements held in eight q registers.
@ rq0-rq7 name the q registers; r0-r15 name the d registers that alias
@ them (r0,r1 == rq0, r2,r3 == rq1, ..., r14,r15 == rq7), which is what
@ lets the 64 bit step be done with vswp on d registers instead of
@ a vtrn.64 on q registers (see the per-line comments).
.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        vswp            \r1,  \r8  @ vtrn.64 \rq0, \rq4
        vswp            \r3,  \r10 @ vtrn.64 \rq1, \rq5
        vswp            \r5,  \r12 @ vtrn.64 \rq2, \rq6
        vswp            \r7,  \r14 @ vtrn.64 \rq3, \rq7
        vtrn.32         \rq0, \rq2
        vtrn.32         \rq1, \rq3
        vtrn.32         \rq4, \rq6
        vtrn.32         \rq5, \rq7
        vtrn.16         \rq0, \rq1
        vtrn.16         \rq2, \rq3
        vtrn.16         \rq4, \rq5
        vtrn.16         \rq6, \rq7
.endm

@ Transpose a 4x4 matrix of 16 bit elements; r0-r3 are the four rows.
.macro transpose16_4x4 r0, r1, r2, r3
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.32         \rq0, \rq1
        vtrn.16         \r0,  \r1
        vtrn.16         \r2,  \r3
.endm

@ The input to and output from this macro is in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
@
@ Scalar inputs read by the macro:
@   r2 = E, r3 = I, r4 = H,
@   r5 = threshold that the flat8in differences are compared against,
@   r6 = left shift used for the saturating clip,
@   r7 = max pixel value.
@ r8 and r9 are clobbered (used to test the masks for all-zero).
@ The macro branches to local label 9 when no pixel needs filtering and,
@ for wd >= 8, to local label 6 when no pixel needs flat8in; neither label
@ is defined inside the macro, so the instantiating code must provide them.
.macro loop_filter_q wd
        vdup.u16        q0,  r2 @ E
        vdup.u16        q1,  r3 @ I

        vabd.u16        q2,  q8,  q9  @ abs(p3 - p2)
        vabd.u16        q3,  q9,  q10 @ abs(p2 - p1)
        vabd.u16        q4,  q10, q11 @ abs(p1 - p0)
        vabd.u16        q5,  q12, q13 @ abs(q0 - q1)
        vabd.u16        q6,  q13, q14 @ abs(q1 - q2)
        vabd.u16        q7,  q14, q15 @ abs(q2 - q3)
        vmax.u16        q2,  q2,  q3
        vmax.u16        q3,  q4,  q5
        vmax.u16        q4,  q6,  q7
        vabd.u16        q5,  q11, q12 @ abs(p0 - q0)
        vmax.u16        q2,  q2,  q3
        vadd.u16        q5,  q5,  q5  @ abs(p0 - q0) * 2
        vabd.u16        q6,  q10, q13 @ abs(p1 - q1)
        vmax.u16        q2,  q2,  q4  @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u16        q6,  q6,  #1
        vcle.u16        q2,  q2,  q1  @ max(abs()) <= I
        vadd.u16        q5,  q5,  q6  @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u16        q5,  q5,  q0
        vand            q2,  q2,  q5  @ fm

        @ Narrow the mask to 64 bits so it can be tested in two core registers
        vmovn.u16       d10, q2
        vmov            r8,  r9,  d10
        orrs            r8,  r8,  r9
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vdup.u16        q0,  r5

        vabd.u16        q1,  q8,  q11 @ abs(p3 - p0)
        vabd.u16        q3,  q9,  q11 @ abs(p2 - p0)
        vabd.u16        q4,  q10, q11 @ abs(p1 - p0)
        vabd.u16        q5,  q13, q12 @ abs(q1 - q0)
        vabd.u16        q6,  q14, q12 @ abs(q2 - q0)
        vabd.u16        q7,  q15, q12 @ abs(q3 - q0)
        vmax.u16        q1,  q1,  q3
        vmax.u16        q4,  q4,  q5
        vmax.u16        q6,  q6,  q7
        @ The rest of the calculation of flat8in is interleaved below
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u16        q3,  q10, q11 @ abs(p1 - p0)
.if \wd == 8
        vmax.u16        q1,  q1,  q4
.endif
        vabd.u16        q4,  q13, q12 @ abs(q1 - q0)
.if \wd == 8
        vmax.u16        q1,  q1,  q6
.endif

        vsub.u16        q5,  q10, q13 @ p1 - q1
        vmax.u16        q3,  q3,  q4  @ max(abs(p1 - p0), abs(q1 - q0))
        vdup.u16        q4,  r4       @ H
        vsub.u16        q6,  q12, q11 @ q0 - p0
.if \wd == 8
        vcle.u16        q1,  q1,  q0  @ flat8in
.endif
        vdup.u16        q0,  r6       @ left shift for saturation
        vcle.u16        q3,  q3,  q4  @ !hev
.if \wd == 8
        vand            q1,  q1,  q2  @ flat8in && fm
.endif
        vneg.s16        q4,  q0       @ negative left shift after saturation
        vqshl.s16       q5,  q5,  q0
.if \wd == 8
        vbic            q2,  q2,  q1  @ fm && !flat8in
.endif
        vmov.s16        q7,  #3
        vand            q3,  q3,  q2  @ !hev && fm && !flat8in
        vshl.s16        q5,  q5,  q4  @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        vmul.s16        q6,  q6,  q7  @ 3 * (q0 - p0)
        vbic            q5,  q5,  q3  @ if (!hev) av_clip_int2p = 0
        vadd.s16        q6,  q6,  q5  @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        vmov.s16        q5,  #4
        vqshl.s16       q6,  q6,  q0
        vmov.s16        q0,  #3
        vshl.s16        q6,  q6,  q4  @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        q4,  r7       @ max pixel value

        vshr.u16        q4,  q4,  #1  @ (1 << (BIT_DEPTH - 1)) - 1

        vadd.s16        q5,  q6,  q5  @ f + 4
        vadd.s16        q0,  q6,  q0  @ f + 3
        vmov.s16        q6,  #0
        vmin.s16        q5,  q5,  q4  @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        q0,  q0,  q4  @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vdup.u16        q4,  r7       @ max pixel value
        vshr.s16        q5,  q5,  #3  @ f1
        vshr.s16        q0,  q0,  #3  @ f2

        vadd.s16        q0,  q11, q0  @ p0 + f2
        vsub.s16        q7,  q12, q5  @ q0 - f1
        vmin.s16        q0,  q0,  q4
        vmin.s16        q7,  q7,  q4
        vrshr.s16       q5,  q5,  #1  @ f = (f1 + 1) >> 1
        vmax.s16        q0,  q0,  q6  @ out p0
        vmax.s16        q7,  q7,  q6  @ out q0
        vbit            q11, q0,  q2  @ if (fm && !flat8in)
        vbit            q12, q7,  q2
.if \wd >= 8
        @ q2 is no longer needed here; reuse its low half (d4) for the
        @ narrowed flat8in mask
        vmovn.u16       d4,  q1
.endif

        vadd.s16        q0,  q10, q5  @ p1 + f
        vsub.s16        q7,  q13, q5  @ q1 - f
.if \wd >= 8
        vmov            r8,  r9,  d4
.endif
        vmin.s16        q0,  q0,  q4
        vmin.s16        q7,  q7,  q4
.if \wd >= 8
        orrs            r8,  r8,  r9
.endif
        vmax.s16        q0,  q0,  q6  @ out p1
        vmax.s16        q7,  q7,  q6  @ out q1
        vbit            q10, q0,  q3  @ if (!hev && fm && !flat8in)
        vbit            q13, q7,  q3

.if \wd >= 8
        @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels
        beq             6f

        @ flat8in
        vadd.u16        q2,  q8,  q9
        vadd.u16        q3,  q10, q13
        vadd.u16        q4,  q8,  q10
        vadd.u16        q5,  q11, q14
        vadd.u16        q0,  q2,  q2
        vadd.u16        q0,  q0,  q11
        vadd.u16        q0,  q0,  q12
        vadd.u16        q0,  q0,  q4
        vsub.s16        q3,  q3,  q2
        vsub.s16        q5,  q5,  q4
        vrshr.u16       q6,  q0,  #3  @ out p2

        vadd.u16        q0,  q0,  q3
        vadd.u16        q2,  q8,  q11
        vadd.u16        q3,  q12, q15
        vrshr.u16       q7,  q0,  #3  @ out p1

        vadd.u16        q0,  q0,  q5
        vsub.s16        q3,  q3,  q2
        vadd.u16        q4,  q9,  q12
        vbit            q9,  q6,  q1
        vadd.u16        q5,  q13, q15
        vrshr.u16       q6,  q0,  #3  @ out p0

        vadd.u16        q0,  q0,  q3
        vsub.s16        q5,  q5,  q4
        vadd.u16        q2,  q10, q13
        vbit            q10, q7,  q1
        vadd.u16        q3,  q14, q15
        vrshr.u16       q7,  q0,  #3  @ out q0

        vadd.u16        q0,  q0,  q5
        vsub.s16        q3,  q3,  q2
        vbit            q11, q6,  q1
        vrshr.u16       q6,  q0,  #3  @ out q1

        vadd.u16        q0,  q0,  q3
        vbit            q12, q7,  q1
        vrshr.u16       q7,  q0,  #3  @ out q2
        vbit            q13, q6,  q1
        vbit            q14, q7,  q1
.endif
.endm

@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ In practice, this is only ever instantiated once, so the macro parameters
@ could be hardcoded, but keeping them as is, to keep similarities to the
@ 8 bpp and aarch64 versions.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        @ Scalar inputs: r2 = E, r3 = I, r4 = H, r5 = threshold for the
        @ flat8in/flat8out comparisons, r6 = left shift for saturation,
        @ r7 = max pixel value. r8 and r9 are clobbered for mask tests.
        @ Local labels 7, 8 and 9 (and 6 when wd != 16) are branched to but
        @ not defined here; the instantiating code must provide them.
        vdup.u16        d0,  r2 @ E
        vdup.u16        d2,  r3 @ I

        vabd.u16        d4,  d20, d21    @ abs(p3 - p2)
        vabd.u16        d5,  d21, d22    @ abs(p2 - p1)
        vabd.u16        d6,  d22, d23    @ abs(p1 - p0)
        vabd.u16        d7,  d24, d25    @ abs(q0 - q1)
        vabd.u16        \tmp1,  d25, d26 @ abs(q1 - q2)
        vabd.u16        \tmp2,  d26, d27 @ abs(q2 - q3)
        vmax.u16        d4,  d4,  d5
        vmax.u16        d5,  d6,  d7
        vmax.u16        \tmp1,  \tmp1,  \tmp2
        vabd.u16        d6,  d23, d24    @ abs(p0 - q0)
        vmax.u16        d4,  d4,  d5
        vadd.u16        d6,  d6,  d6     @ abs(p0 - q0) * 2
        vabd.u16        d5,  d22, d25    @ abs(p1 - q1)
        vmax.u16        d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u16        d5,  d5,  #1
        vcle.u16        d4,  d4,  d2     @ max(abs()) <= I
        vadd.u16        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u16        d6,  d6,  d0
        vand            d4,  d4,  d6     @ fm

        vdup.u16        d3,  r4          @ H
        vmov            r8,  r9,  d4
        orrs            r8,  r8,  r9
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vdup.u16        d0,  r5

        vabd.u16        d6,  d20, d23    @ abs(p3 - p0)
        vabd.u16        d2,  d21, d23    @ abs(p2 - p0)
        vabd.u16        d1,  d22, d23    @ abs(p1 - p0)
        vabd.u16        \tmp1,  d25, d24 @ abs(q1 - q0)
        vabd.u16        \tmp2,  d26, d24 @ abs(q2 - q0)
        vabd.u16        \tmp3,  d27, d24 @ abs(q3 - q0)
        vmax.u16        d6,  d6,  d2
        vmax.u16        d1,  d1,  \tmp1
        vmax.u16        \tmp2,  \tmp2,  \tmp3
.if \wd == 16
        vabd.u16        d7,  d16, d23    @ abs(p7 - p0)
        vmax.u16        d6,  d6,  d1
        vabd.u16        d2,  d17, d23    @ abs(p6 - p0)
        vmax.u16        d6,  d6,  \tmp2
        vabd.u16        d1,  d18, d23    @ abs(p5 - p0)
        vcle.u16        d6,  d6,  d0     @ flat8in
        vabd.u16        d8,  d19, d23    @ abs(p4 - p0)
        vand            d6,  d6,  d4     @ flat8in && fm
        vabd.u16        d9,  d28, d24    @ abs(q4 - q0)
        vbic            d4,  d4,  d6     @ fm && !flat8in
        vabd.u16        d10, d29, d24    @ abs(q5 - q0)
        vabd.u16        d11, d30, d24    @ abs(q6 - q0)
        vabd.u16        d12, d31, d24    @ abs(q7 - q0)

        vmax.u16        d7,  d7,  d2
        vmax.u16        d1,  d1,  d8
        vmax.u16        d9,  d9,  d10
        vmax.u16        d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u16        d5,  d22, d23    @ abs(p1 - p0)
.if \wd == 16
        vmax.u16        d7,  d7,  d1
        vmax.u16        d9,  d9,  d11
.elseif \wd == 8
        vmax.u16        d6,  d6,  d1
.endif
        vabd.u16        d1,  d25, d24    @ abs(q1 - q0)
.if \wd == 16
        vmax.u16        d7,  d7,  d9
.elseif \wd == 8
        vmax.u16        d6,  d6,  \tmp2
.endif
        vdup.u16        \tmp2,  r6       @ left shift for saturation
        vsub.u16        \tmp1,  d22, d25 @ p1 - q1
        vneg.s16        \tmp6,  \tmp2    @ negative left shift after saturation
        vmax.u16        d5,  d5,  d1     @ max(abs(p1 - p0), abs(q1 - q0))
        vsub.u16        \tmp3,  d24, d23 @ q0 - p0
        vmov.s16        \tmp5,  #3
.if \wd == 8
        vcle.u16        d6,  d6,  d0     @ flat8in
.endif
        vcle.u16        d5,  d5,  d3     @ !hev
.if \wd == 8
        vand            d6,  d6,  d4     @ flat8in && fm
.endif
        vqshl.s16       \tmp1,  \tmp1,  \tmp2
.if \wd == 16
        vcle.u16        d7,  d7,  d0     @ flat8out
.elseif \wd == 8
        vbic            d4,  d4,  d6     @ fm && !flat8in
.endif
        vand            d5,  d5,  d4     @ !hev && fm && !flat8in
.if \wd == 16
        vand            d7,  d7,  d6     @ flat8out && flat8in && fm
.endif
        vshl.s16        \tmp1,  \tmp1,  \tmp6 @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        vmul.s16        \tmp3,  \tmp3,  \tmp5 @ 3 * (q0 - p0)
        vbic            \tmp1,  \tmp1,  d5    @ if (!hev) av_clip_int2p = 0
        vmov.s16        d2,  #4
        vadd.s16        \tmp3,  \tmp3,  \tmp1 @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        vmov.s16        d3,  #3
        vqshl.s16       \tmp1,  \tmp3,  \tmp2
        vmov.s16        \tmp5,  #0
        vshl.s16        \tmp1,  \tmp1,  \tmp6 @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        \tmp6,  r7            @ max pixel value
.if \wd == 16
        vbic            d6,  d6,  d7     @ fm && flat8in && !flat8out
.endif

        vshr.u16        \tmp2,  \tmp6,  #1    @ (1 << (BIT_DEPTH - 1)) - 1

        vadd.s16        \tmp3,  \tmp1,  d2    @ f + 4
        vadd.s16        \tmp4,  \tmp1,  d3    @ f + 3
        vmin.s16        \tmp3,  \tmp3,  \tmp2 @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        \tmp4,  \tmp4,  \tmp2 @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vshr.s16        \tmp3,  \tmp3,  #3    @ f1
        vshr.s16        \tmp4,  \tmp4,  #3    @ f2

        vadd.s16        d0,  d23, \tmp4  @ p0 + f2
        vsub.s16        d2,  d24, \tmp3  @ q0 - f1
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
        vrshr.s16       \tmp3,  \tmp3,  #1    @ f = (f1 + 1) >> 1
        vmax.s16        d0,  d0,  \tmp5  @ out p0
        vmax.s16        d2,  d2,  \tmp5  @ out q0
        vbit            d23, d0,  d4     @ if (fm && !flat8in)
        vbit            d24, d2,  d4

        vadd.s16        d0,  d22, \tmp3  @ p1 + f
        vsub.s16        d2,  d25, \tmp3  @ q1 - f
.if \wd >= 8
        vmov            r8,  r9,  d6
.endif
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
.if \wd >= 8
        orrs            r8,  r8,  r9
.endif
        vmax.s16        d0,  d0,  \tmp5  @ out p1
        vmax.s16        d2,  d2,  \tmp5  @ out q1
        vbit            d22, d0,  d5     @ if (!hev && fm && !flat8in)
        vbit            d25, d2,  d5

.if \wd >= 8
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f

        @ flat8in
        vadd.u16        \tmp1,  d20, d21
        vadd.u16        \tmp3,  d22, d25
        vadd.u16        \tmp5,  d20, d22
        vadd.u16        \tmp7,  d23, d26
        vadd.u16        d0,  \tmp1,  \tmp1
        vadd.u16        d0,  d0,  d23
        vadd.u16        d0,  d0,  d24
        vadd.u16        d0,  d0,  \tmp5
        vsub.s16        \tmp3,  \tmp3,  \tmp1
        vsub.s16        \tmp7,  \tmp7,  \tmp5
        vrshr.u16       d2,  d0,  #3     @ out p2

        vadd.u16        d0,  d0,  \tmp3
        vadd.u16        \tmp1,  d20, d23
        vadd.u16        \tmp3,  d24, d27
        vrshr.u16       d3,  d0,  #3     @ out p1

        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3,  \tmp3,  \tmp1
        vadd.u16        \tmp5,  d21, d24
        vadd.u16        \tmp7,  d25, d27
        vrshr.u16       d4,  d0,  #3     @ out p0

        vadd.u16        d0,  d0,  \tmp3
        vsub.s16        \tmp7,  \tmp7,  \tmp5
        vadd.u16        \tmp1,  d22, d25
        vadd.u16        \tmp3,  d26, d27
        vrshr.u16       d5,  d0,  #3     @ out q0

        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3,  \tmp3,  \tmp1
        vrshr.u16       \tmp5,  d0,  #3  @ out q1

        vadd.u16        d0,  d0,  \tmp3
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vbit            d21, d2,  d6
        vbit            d22, d3,  d6
        vbit            d23, d4,  d6
        vrshr.u16       \tmp6,  d0,  #3  @ out q2
        vbit            d24, d5,  d6
        vbit            d25, \tmp5,  d6
        vbit            d26, \tmp6,  d6
.endif
.if \wd == 16
6:
        vorr            d2,  d6,  d7
        vmov            r8,  r9,  d2
        orrs            r8,  r8,  r9
        @ If no pixels needed flat8in nor flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r8,  r9,  d7
        orrs            r8,  r8,  r9
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f

        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16): d7 holds
        @ the flat8out mask used by every vbif below and d16 (p7) is an
        @ unmodified input.
        @ If this part is skipped, the output is read from d21-d26 (which is the input
        @ to this section).
        vshl.u16        d0,  d16, #3  @ 8 * d16
        vsub.u16        d0,  d0,  d16 @ 7 * d16
        vadd.u16        d0,  d0,  d17
        vadd.u16        d8,  d17, d18
        vadd.u16        d10, d19, d20
        vadd.s16        d0,  d0,  d8
        vadd.u16        d8,  d16, d17
        vadd.u16        d12, d21, d22
        vadd.s16        d0,  d0,  d10
        vadd.u16        d10, d18, d25
        vadd.u16        d14, d23, d24
        vsub.s16        d10, d10, d8
        vadd.s16        d0,  d0,  d12
        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d18
        vadd.u16        d14, d19, d26
        vrshr.u16       d2,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d8,  d16, d19
        vadd.u16        d10, d20, d27
        vsub.s16        d14, d14, d12
        vbif            d2,  d17, d7
        vrshr.u16       d3,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d20
        vadd.u16        d14, d21, d28
        vsub.s16        d10, d10, d8
        vbif            d3,  d18, d7
        vrshr.u16       d4,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d8,  d16, d21
        vadd.u16        d10, d22, d29
        vsub.s16        d14, d14, d12
        vbif            d4,  d19, d7
        vrshr.u16       d5,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d22
        vadd.u16        d14, d23, d30
        vsub.s16        d10, d10, d8
        vbif            d5,  d20, d7
        vrshr.u16       d6,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d10, d16, d23
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d24, d31
        vbif            d6,  d21, d7
        vrshr.u16       d8,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vsub.s16        d10, d12, d10
        vadd.u16        d12, d17, d24
        vadd.u16        d14, d25, d31
        vbif            d8,  d22, d7
        vrshr.u16       d9,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d26, d31
        vbif            d9,  d23, d7
        vrshr.u16       d10, d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d14, d18, d25
        vadd.u16        d18, d19, d26
        vsub.s16        d12, d12, d14
        vadd.u16        d14, d27, d31
        vbif            d10, d24, d7
        vrshr.u16       d11, d0,  #4

        vadd.s16        d0,  d0,  d12
        vadd.u16        d12, d20, d27
        vsub.s16        d14, d14, d18
        vadd.u16        d18, d28, d31
        vbif            d11, d25, d7
        vsub.s16        d18, d18, d12
        vrshr.u16       d12, d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d14, d21, d28
        vadd.u16        d20, d29, d31
        vbif            d12, d26, d7
        vrshr.u16       d13, d0,  #4

        vadd.s16        d0,  d0,  d18
        vsub.s16        d20, d20, d14
        vadd.u16        d18, d22, d29
        vadd.u16        d22, d30, d31
        vbif            d13, d27, d7
        vrshr.u16       d14, d0,  #4

        vadd.s16        d0,  d0,  d20
        vsub.s16        d22, d22, d18
        vbif            d14, d28, d7
        vrshr.u16       d15, d0,  #4

        vadd.s16        d0,  d0,  d22
        vbif            d15, d29, d7
        vrshr.u16       d17, d0,  #4
        vbif            d17, d30, d7
.endif
.endm

@ Fixed-width instantiations of the filter macros.
.macro loop_filter_q_4
        loop_filter_q   4
.endm

.macro loop_filter_q_8
        loop_filter_q   8
.endm

.macro loop_filter_16
        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15
.endm


@ The public functions in this file have got the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

@ Emit the exported ff_<func>_<bpp>_neon entry point, which sets up the
@ bit depth dependent scalars and calls the internal <func>_16_neon once.
@ On entry r2 = mb_lim, r3 = lim; hev_thr is the first stack argument,
@ found at [sp, #28] once the 7 core registers have been pushed.
.macro bpp_frontend func, bpp
function ff_\func\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        @ Scale the three thresholds up to the target bit depth
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

@ Instantiate bpp_frontend for 10 and 12 bpp.
.macro bpp_frontends func
        bpp_frontend    \func, 10
        bpp_frontend    \func, 12
.endm

@ Like bpp_frontend, but calls the internal <func>_<int_suffix>_16_neon
@ twice (or four times if rep >= 4), advancing r0 between the calls by
@ 4 * stride (r1, lsl #2) when dir is h, or by 8 bytes otherwise.
.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
.else
        add             r0,  r0,  #8
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \rep >= 4
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
.else
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
.endif
.endif
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

@ Instantiate bpp_frontend_rep for 10 and 12 bpp.
.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12
.endm

@ Emit ff_vp9_loop_filter_<dir>_<wd1><wd2>_16_<bpp>_neon, which filters two
@ adjacent 8 pixel edges with separate widths and thresholds. The low byte
@ of each of mb_lim/lim/hev_thr is used for the first call (width wd1) and
@ bits 8 and up for the second call (width wd2). Between the calls r0 is
@ advanced by 8 * stride (r1, lsl #3) when dir is h, or by 16 bytes otherwise.
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        @ Keep the packed thresholds around for the second call
        push            {r2, r3, r4}
        and             r2,  r2,  #0xff
        and             r3,  r3,  #0xff
        and             r4,  r4,  #0xff
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #3
.else
        add             r0,  r0,  #16
.endif
        pop             {r2, r3, r4}
        lsr             r2,  r2,  #8
        lsr             r3,  r3,  #8
        lsr             r4,  r4,  #8
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

@ Instantiate bpp_frontend_mix2 for both directions and for 10 and 12 bpp.
.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

@ Internal worker: width 4 filter across a horizontal edge, 8 pixels wide.
@ r0 points at the q0 row and r1 is the stride in bytes; the scalars
@ r2-r7 are expected to be set up as done by the bpp_frontend macros.
@ r0 is restored to its entry value before returning, on both exit paths.
function vp9_loop_filter_v_4_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r0,  r0,  r1, lsl #2
        @ r12 is back at the q0 row after the four loads; step back to p1
        sub             r12, r12, r1, lsl #1

        loop_filter_q_4

        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
9:
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_v_4_8

@ Internal routine, reached through the bpp_frontend wrappers (r2-r7 are
@ expected to be set up by them).
@ In:  r0 = address of the q0 pixel in the first of 8 rows, r1 = stride in bytes
@ Loads 8 rows of one q register each, starting 8 bytes before r0 (rows 0-3
@ through r12, rows 4-7 through r0), transposes them so that q8-q15 can be
@ passed to loop_filter_q_4, and stores the middle 4 pixels of every row back.
@ Out: r0 is stepped back to its entry value; r12 is clobbered.
function vp9_loop_filter_h_4_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #4
        add             r0,  r0,  #4

        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_4

        @ We only will write the mid 4 pixels back; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (8x4 pixels).
        @ We need to transpose them to columns, done with a
        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
        @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2
        @ NOTE(review): nothing in this function body branches to label 9;
        @ presumably the loop_filter_q macro uses it as an early exit - confirm
        @ against the macro definition. On both paths r12 is 4 bytes below
        @ the entry value of r0 here.
9:
        add             r0,  r12, #4
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_h_4_8


@ Internal routine, reached through the bpp_frontend wrappers (r2-r7 are
@ expected to be set up by them).
@ In:  r0 = address of the q0 row, r1 = stride in bytes
@ Loads p3-p0 from the four rows before r0 and q0-q3 from r0 onwards, one full
@ q register per row, and runs loop_filter_q_8. The normal path stores q9-q11
@ (p2-p0) and q12-q14 (q0-q2); the path at label 6 stores only q10, q11 and
@ q12, q13, like vp9_loop_filter_v_4_8_16_neon does.
@ Out: r0 is stepped back to its entry value; r12 is clobbered.
function vp9_loop_filter_v_8_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Skip the p3 row; the writeback below starts at p2.
        add             r12, r12, r1

        loop_filter_q_8

        vst1.16         {q9},  [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q14}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
        @ NOTE(review): labels 9 and 6 are not branched to from this function
        @ body; presumably loop_filter_q jumps to them (9: return without a
        @ writeback, 6: 4 pixel writeback) - confirm against the macro.
9:
        bx              lr
6:
        sub             r12, r0,  r1, lsl #1
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_v_8_8


@ Internal routine, reached through the bpp_frontend wrappers (r2-r7 are
@ expected to be set up by them).
@ In:  r0 = address of the q0 pixel in the first of 8 rows, r1 = stride in bytes
@ Loads 8 rows of one q register each, starting 8 bytes before r0 (rows 0-3
@ through r12, rows 4-7 through r0), transposes them and runs loop_filter_q_8.
@ The normal path transposes back and stores all 8 pixels of every row; the
@ path at label 6 stores only the middle 4 pixels.
@ Out: r0 is stepped back to its entry value; r12 is clobbered.
function vp9_loop_filter_h_8_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2

        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        vst1.16         {q8},  [r12,:64], r1
        vst1.16         {q12}, [r0, :64], r1
        vst1.16         {q9},  [r12,:64], r1
        vst1.16         {q13}, [r0, :64], r1
        vst1.16         {q10}, [r12,:64], r1
        vst1.16         {q14}, [r0, :64], r1
        vst1.16         {q11}, [r12,:64], r1
        vst1.16         {q15}, [r0, :64], r1
        sub             r12, r12, r1, lsl #2
        @ NOTE(review): labels 9 and 6 are not branched to from this function
        @ body; presumably loop_filter_q jumps to them - confirm against the
        @ macro. At label 9 r12 is 8 bytes below the entry value of r0.
9:
        add             r0,  r12, #8
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        add             r12, r12, #4
        add             r0,  r0,  #4
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2
        add             r0,  r12, #4
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_h_8_8

@ Exported wrappers for every width pair; the fourth one (8, 8) follows.
bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

@ Internal routine, called repeatedly by the bpp_frontend_rep wrappers (r2-r7
@ are expected to be set up by them).
@ In:  r0 = address of the q0 row, r1 = stride in bytes
@ Loads p7-p0 from the eight rows before r0 and q0-q7 from r0 onwards, one d
@ register per row, and runs loop_filter_16. Three writebacks exist:
@   fall through - 7 rows on either side of r0, from d2-d6, d8-d15 and d17
@   label 8      - 3 rows on either side (d21-d23, d24-d26)
@   label 7      - 2 rows on either side (d22-d23, d24-d25)
@ Out: r0 is stepped back to its entry value on every path; r12 is clobbered.
function vp9_loop_filter_v_16_4_16_neon
        sub             r12, r0,  r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.16         {d16}, [r12,:64], r1 @ p7
        vld1.16         {d24}, [r0, :64], r1 @ q0
        vld1.16         {d17}, [r12,:64], r1 @ p6
        vld1.16         {d25}, [r0, :64], r1 @ q1
        vld1.16         {d18}, [r12,:64], r1 @ p5
        vld1.16         {d26}, [r0, :64], r1 @ q2
        vld1.16         {d19}, [r12,:64], r1 @ p4
        vld1.16         {d27}, [r0, :64], r1 @ q3
        vld1.16         {d20}, [r12,:64], r1 @ p3
        vld1.16         {d28}, [r0, :64], r1 @ q4
        vld1.16         {d21}, [r12,:64], r1 @ p2
        vld1.16         {d29}, [r0, :64], r1 @ q5
        vld1.16         {d22}, [r12,:64], r1 @ p1
        vld1.16         {d30}, [r0, :64], r1 @ q6
        vld1.16         {d23}, [r12,:64], r1 @ p0
        vld1.16         {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0,  r0,  r1, lsl #3
        @ Skip the p7 row; r12 now points at r0 - 7 * stride.
        add             r12, r12, r1

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.16         {d2},  [r12,:64], r1
        vst1.16         {d10}, [r0, :64], r1
        vst1.16         {d3},  [r12,:64], r1
        vst1.16         {d11}, [r0, :64], r1
        vst1.16         {d4},  [r12,:64], r1
        vst1.16         {d12}, [r0, :64], r1
        vst1.16         {d5},  [r12,:64], r1
        vst1.16         {d13}, [r0, :64], r1
        vst1.16         {d6},  [r12,:64], r1
        vst1.16         {d14}, [r0, :64], r1
        vst1.16         {d8},  [r12,:64], r1
        vst1.16         {d15}, [r0, :64], r1
        vst1.16         {d9},  [r12,:64], r1
        vst1.16         {d17}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  r1

        @ NOTE(review): labels 9, 8 and 7 are not branched to from this
        @ function body; presumably the loop_filter macro jumps to them -
        @ confirm against the macro definition.
9:
        bx              lr

8:
        add             r12, r12, r1, lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
        vst1.16         {d21}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d26}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
        bx              lr
7:
        sub             r12, r0,  r1, lsl #1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

bpp_frontends_rep vp9_loop_filter_v_16, 8,  4, 2, v
bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v

@ Internal routine, called repeatedly by the bpp_frontend_rep wrappers (r2-r7
@ are expected to be set up by them).
@ In:  r0 = address of the q0 pixel in the first of 4 rows, r1 = stride in bytes
@ Loads 4 rows of four d registers each, covering the 16 bytes before r0 and
@ the 16 bytes from r0 onwards, transposes them in 4x4 blocks and runs
@ loop_filter_16. Three writebacks exist:
@   fall through - all four 4x4 blocks, from d16, d2-d6, d8-d15, d17 and d31
@   label 8      - the two middle blocks (q10-q13)
@   label 7      - one block made of q11 and q12, stored 4 bytes before r0
@ Out: r0 is stepped back to its entry value on every path; r12 is clobbered.
function vp9_loop_filter_h_16_4_16_neon
        sub             r12, r0,  #16
        sub             r0,  r0,  #8
        vld1.16         {d16}, [r12,:64], r1
        vld1.16         {d20}, [r0, :64], r1
        vld1.16         {d17}, [r12,:64], r1
        vld1.16         {d21}, [r0, :64], r1
        vld1.16         {d18}, [r12,:64], r1
        vld1.16         {d22}, [r0, :64], r1
        vld1.16         {d19}, [r12,:64], r1
        vld1.16         {d23}, [r0, :64], r1
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, #16
        add             r0,  r0,  #16
        vld1.16         {d24}, [r12,:64], r1
        vld1.16         {d28}, [r0, :64], r1
        vld1.16         {d25}, [r12,:64], r1
        vld1.16         {d29}, [r0, :64], r1
        vld1.16         {d26}, [r12,:64], r1
        vld1.16         {d30}, [r0, :64], r1
        vld1.16         {d27}, [r12,:64], r1
        vld1.16         {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #2
        sub             r12, r12, #16
        sub             r0,  r0,  #16

        @ The 16x4 pixels read above are in four 4x4 blocks
        transpose16_q_4x4 q8,  q9,  d16, d17, d18, d19
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
        transpose16_q_4x4 q14, q15, d28, d29, d30, d31

        loop_filter_16

        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for the transpose, since
        @ all d registers in the transpose aren't consecutive.
        transpose16_4x4 d16, d2,  d3,  d4
        transpose16_4x4 d5,  d6,  d8,  d9
        transpose16_4x4 d10, d11, d12, d13
        transpose16_4x4 d14, d15, d17, d31

        vst1.16         {d16}, [r12,:64], r1
        vst1.16         {d5},  [r0, :64], r1

        vst1.16         {d2},  [r12,:64], r1
        vst1.16         {d6},  [r0, :64], r1

        vst1.16         {d3},  [r12,:64], r1
        vst1.16         {d8},  [r0, :64], r1

        vst1.16         {d4},  [r12,:64], r1
        vst1.16         {d9},  [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, #16
        add             r0,  r0,  #16

        vst1.16         {d10}, [r12,:64], r1
        vst1.16         {d14}, [r0, :64], r1

        vst1.16         {d11}, [r12,:64], r1
        vst1.16         {d15}, [r0, :64], r1

        vst1.16         {d12}, [r12,:64], r1
        vst1.16         {d17}, [r0, :64], r1

        vst1.16         {d13}, [r12,:64], r1
        vst1.16         {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  #8
        bx              lr
        @ NOTE(review): labels 9, 8 and 7 are not branched to from this
        @ function body; presumably the loop_filter macro jumps to them -
        @ confirm against the macro definition. At that point r0 is 8 bytes
        @ and r12 is 16 bytes below the entry value of r0.
9:
        add             r0,  r0,  #8
        bx              lr
8:
        add             r12, r12, #8
        add             r0,  r0,  #8
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27

        vst1.16         {d20}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d21}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d26}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d27}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        bx              lr
7:
        @ r12 covers rows 0-1 and r0 rows 2-3, both 4 bytes before the entry
        @ value of r0.
        add             r12, r12, #12
        add             r0,  r12, r1, lsl #1
        transpose16_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.16         {d22}, [r12], r1
        vst1.16         {d24}, [r0],  r1
        vst1.16         {d23}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        sub             r0,  r0,  r1, lsl #2
        add             r0,  r0,  #4
        bx              lr
endfunc

bpp_frontends_rep vp9_loop_filter_h_16, 8,  4, 2, h
bpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h