/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

@ Do an 8x8 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1, etc
@ Arguments: rq0-rq3 name the four q registers that hold the 8x8 byte block,
@ and r0-r7 name the eight d registers aliasing those same q registers, in
@ order. The 32-bit and 16-bit exchange steps work on whole q registers;
@ only the final 8-bit step has to pair up the two d halves of each q register.
.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
        vtrn.32         \rq0, \rq2
        vtrn.32         \rq1, \rq3
        vtrn.16         \rq0, \rq1
        vtrn.16         \rq2, \rq3
        vtrn.8          \r0,  \r1
        vtrn.8          \r2,  \r3
        vtrn.8          \r4,  \r5
        vtrn.8          \r6,  \r7
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
@ Arguments: rq0-rq1 name the two q registers to transpose and r0-r3 name
@ the four d registers aliasing them, in order.
.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.16         \rq0, \rq1
        vtrn.8          \r0,  \r1
        vtrn.8          \r2,  \r3
.endm

@ The input to and output from this macro is in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
@
@ Applies the narrow (p1..q1) filter to 16 pixels at once; every q register
@ holds one pixel position for all 16 pixels.
@ Expects:
@   r2 = E, r3 = I: two packed 8-bit values each. Bits 0-7 are broadcast to
@        the low d half of the threshold register, bits 8-15 to the high half.
@   [sp, #64] = H, packed the same way. The #64 offset matches the
@        "vpush {q4-q7}" done by the functions in this file that invoke
@        this macro.
@ Clobbers r2, r3 and q0-q7. Only q10-q13 (p1, p0, q0, q1) are updated.
@ Branches forward to local label 9, which the invoking function must define,
@ when no pixel passes the filter mask.
.macro loop_filter_q
        vdup.u8         d0,  r2                     @ E
        lsr             r2,  r2,  #8
        vdup.u8         d2,  r3                     @ I
        lsr             r3,  r3,  #8
        vdup.u8         d1,  r2                     @ E
        vdup.u8         d3,  r3                     @ I

        vabd.u8         q2,  q8,  q9                @ abs(p3 - p2)
        vabd.u8         q3,  q9,  q10               @ abs(p2 - p1)
        vabd.u8         q4,  q10, q11               @ abs(p1 - p0)
        vabd.u8         q5,  q12, q13               @ abs(q0 - q1)
        vabd.u8         q6,  q13, q14               @ abs(q1 - q2)
        vabd.u8         q7,  q14, q15               @ abs(q2 - q3)
        vmax.u8         q2,  q2,  q3
        vmax.u8         q3,  q4,  q5
        vmax.u8         q4,  q6,  q7
        vabd.u8         q5,  q11, q12               @ abs(p0 - q0)
        vmax.u8         q2,  q2,  q3
        vqadd.u8        q5,  q5,  q5                @ abs(p0 - q0) * 2
        vabd.u8         q7,  q10, q13               @ abs(p1 - q1)
        vmax.u8         q2,  q2,  q4                @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         q7,  q7,  #1
        vcle.u8         q2,  q2,  q1                @ max(abs()) <= I
        vqadd.u8        q5,  q5,  q7                @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u8         q5,  q5,  q0
        vand            q2,  q2,  q5                @ fm

        @ Narrow the 128-bit mask to 64 bits so it can be tested in r2/r3
        vshrn.u16       d10, q2,  #4
        vmov            r2,  r3,  d10
        orrs            r2,  r2,  r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        ldr             r3,  [sp, #64]
        vabd.u8         q3,  q10, q11               @ abs(p1 - p0)
        vabd.u8         q4,  q13, q12               @ abs(q1 - q0)

        vsubl.u8        q5,  d20, d26               @ p1 - q1
        vsubl.u8        q6,  d21, d27               @ p1 - q1
        vmax.u8         q3,  q3,  q4                @ max(abs(p1 - p0), abs(q1 - q0))
        vqmovn.s16      d10, q5                     @ av_clip_int8(p1 - q1)
        vqmovn.s16      d11, q6                     @ av_clip_int8(p1 - q1)
        vdup.u8         d8,  r3                     @ H
        lsr             r3,  r3,  #8
        vdup.u8         d9,  r3                     @ H
        vsubl.u8        q6,  d24, d22               @ q0 - p0
        vsubl.u8        q7,  d25, d23               @ q0 - p0
        vcle.u8         q3,  q3,  q4                @ !hev
        vmov.s16        q0,  #3
        vand            q3,  q3,  q2                @ !hev && fm

        vmul.s16        q6,  q6,  q0                @ 3 * (q0 - p0)
        vmul.s16        q7,  q7,  q0                @ 3 * (q0 - p0)
        vbic            q5,  q5,  q3                @ if (!hev) av_clip_int8 = 0
        vaddw.s8        q6,  q6,  d10               @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vaddw.s8        q7,  q7,  d11               @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         q5,  #4
        vqmovn.s16      d12, q6
        vqmovn.s16      d13, q7                     @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f
        vmov.s8         q0,  #3

        vqadd.s8        q5,  q6,  q5                @ FFMIN(f + 4, 127)
        vqadd.s8        q0,  q6,  q0                @ FFMIN(f + 3, 127)
        vmovl.u8        q6,  d22                    @ p0
        vmovl.u8        q7,  d23                    @ p0
        vshr.s8         q5,  q5,  #3                @ f1
        vshr.s8         q0,  q0,  #3                @ f2

        vaddw.s8        q6,  q6,  d0                @ p0 + f2
        vaddw.s8        q7,  q7,  d1                @ p0 + f2
        vqmovun.s16     d0,  q6                     @ out p0
        vmovl.u8        q6,  d24                    @ q0
        vqmovun.s16     d1,  q7                     @ out p0
        vmovl.u8        q7,  d25                    @ q0
        vsubw.s8        q6,  q6,  d10               @ q0 - f1
        vsubw.s8        q7,  q7,  d11               @ q0 - f1
        vqmovun.s16     d12, q6                     @ out q0
        vqmovun.s16     d13, q7                     @ out q0
        vrshr.s8        q5,  q5,  #1                @ f = (f1 + 1) >> 1
        vbit            q11, q0,  q2                @ if (fm)
        vbit            q12, q6,  q2

        vmovl.u8        q0,  d20                    @ p1
        vmovl.u8        q2,  d21                    @ p1
        vmovl.u8        q6,  d26                    @ q1
        vmovl.u8        q7,  d27                    @ q1
        vaddw.s8        q0,  q0,  d10               @ p1 + f
        vaddw.s8        q2,  q2,  d11               @ p1 + f
        vsubw.s8        q6,  q6,  d10               @ q1 - f
        vsubw.s8        q7,  q7,  d11               @ q1 - f
        vqmovun.s16     d0,  q0                     @ out p1
        vqmovun.s16     d1,  q2                     @ out p1
        vqmovun.s16     d12, q6                     @ out q1
        vqmovun.s16     d13, q7                     @ out q1
        vbit            q10, q0,  q3                @ if (!hev && fm)
        vbit            q13, q6,  q3
.endm

@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
@
@ wd selects the filter width (4, 8 or 16). The tmp arguments name the temp
@ registers, both as d registers and as the q registers aliasing them.
@ Expects r2 = E, r3 = I and reads H from [sp] at the point of expansion.
@ r2 and r3 are clobbered.
@ Forward branch targets used by the macro:
@   9: no pixel passes the filter mask; nothing was modified
@   6: no pixel needs flat8in. For wd == 8 the invoking function must define
@      it (write out the inner 4 pixels); for wd == 16 the macro defines it.
@   7: (wd == 16) no pixel needed flat8in or flat8out; write out the inner
@      4 pixels
@   8: (wd == 16) no pixel needs flat8out; write out the inner 6 pixels
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
        vdup.u8         d0,  r2                     @ E
        vdup.u8         d2,  r3                     @ I
        ldr             r3,  [sp]

        vabd.u8         d4,  d20, d21               @ abs(p3 - p2)
        vabd.u8         d5,  d21, d22               @ abs(p2 - p1)
        vabd.u8         d6,  d22, d23               @ abs(p1 - p0)
        vabd.u8         d7,  d24, d25               @ abs(q0 - q1)
        vabd.u8         \tmp1,  d25, d26            @ abs(q1 - q2)
        vabd.u8         \tmp2,  d26, d27            @ abs(q2 - q3)
        vmax.u8         d4,  d4,  d5
        vmax.u8         d5,  d6,  d7
        vmax.u8         \tmp1,  \tmp1,  \tmp2
        vabd.u8         d6,  d23, d24               @ abs(p0 - q0)
        vmax.u8         d4,  d4,  d5
        vqadd.u8        d6,  d6,  d6                @ abs(p0 - q0) * 2
        vabd.u8         d5,  d22, d25               @ abs(p1 - q1)
        vmax.u8         d4,  d4,  \tmp1             @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         d5,  d5,  #1
        vcle.u8         d4,  d4,  d2                @ max(abs()) <= I
        vqadd.u8        d6,  d6,  d5                @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u8         d5,  d6,  d0
        vand            d4,  d4,  d5                @ fm

        vdup.u8         d3,  r3                     @ H
        vmov            r2,  r3,  d4
        orrs            r2,  r2,  r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vmov.u8         d0,  #1

        vabd.u8         d6,  d20, d23               @ abs(p3 - p0)
        vabd.u8         d2,  d21, d23               @ abs(p2 - p0)
        vabd.u8         d1,  d22, d23               @ abs(p1 - p0)
        vabd.u8         \tmp1,  d25, d24            @ abs(q1 - q0)
        vabd.u8         \tmp2,  d26, d24            @ abs(q2 - q0)
        vabd.u8         \tmp3,  d27, d24            @ abs(q3 - q0)
        vmax.u8         d6,  d6,  d2
        vmax.u8         d1,  d1,  \tmp1
        vmax.u8         \tmp2,  \tmp2,  \tmp3
.if \wd == 16
        vabd.u8         d7,  d16, d23               @ abs(p7 - p0)
        vmax.u8         d6,  d6,  d1
        vabd.u8         d2,  d17, d23               @ abs(p6 - p0)
        vmax.u8         d6,  d6,  \tmp2
        vabd.u8         d1,  d18, d23               @ abs(p5 - p0)
        vcle.u8         d6,  d6,  d0                @ flat8in
        vabd.u8         d8,  d19, d23               @ abs(p4 - p0)
        vand            d6,  d6,  d4                @ flat8in && fm
        vabd.u8         d9,  d28, d24               @ abs(q4 - q0)
        vbic            d4,  d4,  d6                @ fm && !flat8in
        vabd.u8         d10, d29, d24               @ abs(q5 - q0)
        vabd.u8         d11, d30, d24               @ abs(q6 - q0)
        vabd.u8         d12, d31, d24               @ abs(q7 - q0)

        vmax.u8         d7,  d7,  d2
        vmax.u8         d1,  d1,  d8
        vmax.u8         d9,  d9,  d10
        vmax.u8         d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u8         d5,  d22, d23               @ abs(p1 - p0)
.if \wd == 16
        vmax.u8         d7,  d7,  d1
        vmax.u8         d9,  d9,  d11
.elseif \wd == 8
        vmax.u8         d6,  d6,  d1
.endif
        vabd.u8         d1,  d25, d24               @ abs(q1 - q0)
.if \wd == 16
        vmax.u8         d7,  d7,  d9
.elseif \wd == 8
        vmax.u8         d6,  d6,  \tmp2
.endif
        vsubl.u8        \tmpq1,  d22, d25           @ p1 - q1
        vmax.u8         d5,  d5,  d1                @ max(abs(p1 - p0), abs(q1 - q0))
        vsubl.u8        \tmpq2,  d24, d23           @ q0 - p0
        vmov.s16        \tmpq3,  #3
.if \wd == 8
        vcle.u8         d6,  d6,  d0                @ flat8in
.endif
        vcle.u8         d5,  d5,  d3                @ !hev
.if \wd == 8
        vand            d6,  d6,  d4                @ flat8in && fm
.endif
        vqmovn.s16      \tmp1,   \tmpq1             @ av_clip_int8(p1 - q1)
.if \wd == 16
        vcle.u8         d7,  d7,  d0                @ flat8out
.elseif \wd == 8
        vbic            d4,  d4,  d6                @ fm && !flat8in
.endif
        vand            d5,  d5,  d4                @ !hev && fm && !flat8in
.if \wd == 16
        vand            d7,  d7,  d6                @ flat8out && flat8in && fm
.endif

        vmul.s16        \tmpq2,  \tmpq2, \tmpq3     @ 3 * (q0 - p0)
        vbic            \tmp1,   \tmp1,   d5        @ if (!hev) av_clip_int8 = 0
        vmov.s8         d2,  #4
        vaddw.s8        \tmpq2,  \tmpq2, \tmp1      @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         d3,  #3
        vqmovn.s16      \tmp1,   \tmpq2             @ f
.if \wd == 16
        vbic            d6,  d6,  d7                @ fm && flat8in && !flat8out
.endif

        vqadd.s8        \tmp3,   \tmp1,   d2        @ FFMIN(f + 4, 127)
        vqadd.s8        \tmp4,   \tmp1,   d3        @ FFMIN(f + 3, 127)
        vmovl.u8        q0,  d23                    @ p0
        vshr.s8         \tmp3,   \tmp3,   #3        @ f1
        vshr.s8         \tmp4,   \tmp4,   #3        @ f2

        vmovl.u8        q1,  d24                    @ q0
        vaddw.s8        q0,  q0,  \tmp4             @ p0 + f2
        vsubw.s8        q1,  q1,  \tmp3             @ q0 - f1
        vqmovun.s16     d0,  q0                     @ out p0
        vqmovun.s16     d1,  q1                     @ out q0
        vrshr.s8        \tmp3, \tmp3, #1            @ f = (f1 + 1) >> 1
        vbit            d23, d0,  d4                @ if (fm && !flat8in)
        vbit            d24, d1,  d4

        vmovl.u8        q0,  d22                    @ p1
        vmovl.u8        q1,  d25                    @ q1
.if \wd >= 8
        @ Move the flat8in mask to r2/r3; it is tested by the orrs/beq below
        vmov            r2,  r3,  d6
.endif
        vaddw.s8        q0,  q0,  \tmp3             @ p1 + f
        vsubw.s8        q1,  q1,  \tmp3             @ q1 - f
.if \wd >= 8
        orrs            r2,  r2,  r3
.endif
        vqmovun.s16     d0,  q0                     @ out p1
        vqmovun.s16     d2,  q1                     @ out q1
        vbit            d22, d0,  d5                @ if (!hev && fm && !flat8in)
        vbit            d25, d2,  d5

.if \wd >= 8
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f

        @ flat8in
        @ q0 holds a running sum; each following output adds the incoming
        @ pixel pair and subtracts the outgoing pair that were precomputed
        @ in the tmpq registers.
        vaddl.u8        \tmpq1, d20, d21
        vaddl.u8        \tmpq2, d22, d25
        vaddl.u8        \tmpq3, d20, d22
        vaddl.u8        \tmpq4, d23, d26
        vadd.u16        q0,  \tmpq1,  \tmpq1
        vaddw.u8        q0,  q0,  d23
        vaddw.u8        q0,  q0,  d24
        vadd.u16        q0,  q0,  \tmpq3
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vrshrn.u16      d2,  q0,  #3                @ out p2

        vadd.u16        q0,  q0,  \tmpq2
        vaddl.u8        \tmpq1, d20, d23
        vaddl.u8        \tmpq2, d24, d27
        vrshrn.u16      d3,  q0,  #3                @ out p1

        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vaddl.u8        \tmpq3, d21, d24
        vaddl.u8        \tmpq4, d25, d27
        vrshrn.u16      d4,  q0,  #3                @ out p0

        vadd.u16        q0,  q0,  \tmpq2
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vaddl.u8        \tmpq1, d22, d25
        vaddl.u8        \tmpq2, d26, d27
        vrshrn.u16      d5,  q0,  #3                @ out q0

        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vrshrn.u16      \tmp5,  q0,  #3             @ out q1

        vadd.u16        q0,  q0,  \tmpq2
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vbit            d21, d2,  d6
        vbit            d22, d3,  d6
        vbit            d23, d4,  d6
        vrshrn.u16      \tmp6,  q0,  #3             @ out q2
        vbit            d24, d5,  d6
        vbit            d25, \tmp5,  d6
        vbit            d26, \tmp6,  d6
.endif
.if \wd == 16
6:
        vorr            d2,  d6,  d7
        vmov            r2,  r3,  d2
        orrs            r2,  r2,  r3
        @ If no pixels needed flat8in nor flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r2,  r3,  d7
        orrs            r2,  r2,  r3
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f

        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16); d7 is the
        @ flat8out mask consumed by every vbif below, and d6 is overwritten
        @ once its mask is no longer needed.
        @ Output order: d2 = p6, d3 = p5, d4 = p4, d5 = p3, d6 = p2, d8 = p1,
        @ d9 = p0, d10 = q0, d11 = q1, d12 = q2, d13 = q3, d14 = q4, d15 = q5,
        @ d17 = q6. Each vbif keeps the unfiltered input wherever d7 is clear.
        @ If this part is skipped, the output is read from d21-d26 (which is the input
        @ to this section).
        vshll.u8        q0,  d16, #3                @ 8 * d16
        vsubw.u8        q0,  q0,  d16               @ 7 * d16
        vaddw.u8        q0,  q0,  d17
        vaddl.u8        q4,  d17, d18
        vaddl.u8        q5,  d19, d20
        vadd.s16        q0,  q0,  q4
        vaddl.u8        q4,  d16, d17
        vaddl.u8        q6,  d21, d22
        vadd.s16        q0,  q0,  q5
        vaddl.u8        q5,  d18, d25
        vaddl.u8        q7,  d23, d24
        vsub.s16        q5,  q5,  q4
        vadd.s16        q0,  q0,  q6
        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d18
        vaddl.u8        q7,  d19, d26
        vrshrn.u16      d2,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q4,  d16, d19
        vaddl.u8        q5,  d20, d27
        vsub.s16        q7,  q7,  q6
        vbif            d2,  d17, d7
        vrshrn.u16      d3,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d20
        vaddl.u8        q7,  d21, d28
        vsub.s16        q5,  q5,  q4
        vbif            d3,  d18, d7
        vrshrn.u16      d4,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q4,  d16, d21
        vaddl.u8        q5,  d22, d29
        vsub.s16        q7,  q7,  q6
        vbif            d4,  d19, d7
        vrshrn.u16      d5,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d22
        vaddl.u8        q7,  d23, d30
        vsub.s16        q5,  q5,  q4
        vbif            d5,  d20, d7
        vrshrn.u16      d6,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q5,  d16, d23
        vsub.s16        q7,  q7,  q6
        vaddl.u8        q6,  d24, d31
        vbif            d6,  d21, d7
        vrshrn.u16      d8,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vsub.s16        q5,  q6,  q5
        vaddl.u8        q6,  d17, d24
        vaddl.u8        q7,  d25, d31
        vbif            d8,  d22, d7
        vrshrn.u16      d9,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vsub.s16        q7,  q7,  q6
        vaddl.u8        q6,  d26, d31
        vbif            d9,  d23, d7
        vrshrn.u16      d10, q0,  #4

        @ From here on q9-q11 (d18-d23) are reused as temporaries
        vadd.s16        q0,  q0,  q7
        vaddl.u8        q7,  d18, d25
        vaddl.u8        q9,  d19, d26
        vsub.s16        q6,  q6,  q7
        vaddl.u8        q7,  d27, d31
        vbif            d10, d24, d7
        vrshrn.u16      d11, q0,  #4

        vadd.s16        q0,  q0,  q6
        vaddl.u8        q6,  d20, d27
        vsub.s16        q7,  q7,  q9
        vaddl.u8        q9,  d28, d31
        vbif            d11, d25, d7
        vsub.s16        q9,  q9,  q6
        vrshrn.u16      d12, q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q7,  d21, d28
        vaddl.u8        q10, d29, d31
        vbif            d12, d26, d7
        vrshrn.u16      d13, q0,  #4

        vadd.s16        q0,  q0,  q9
        vsub.s16        q10, q10, q7
        vaddl.u8        q9,  d22, d29
        vaddl.u8        q11, d30, d31
        vbif            d13, d27, d7
        vrshrn.u16      d14, q0,  #4

        vadd.s16        q0,  q0,  q10
        vsub.s16        q11, q11, q9
        vbif            d14, d28, d7
        vrshrn.u16      d15, q0,  #4

        vadd.s16        q0,  q0,  q11
        vbif            d15, d29, d7
        vrshrn.u16      d17, q0,  #4
        vbif            d17, d30, d7
.endif
.endm

@ For wd <= 8, we use d16-d19 and d28-d31 for temp registers,
@ while we need those for inputs/outputs in wd=16 and use d8-d15
@ for temp registers there instead.
@ Fixed-width wrappers around loop_filter; they only choose the filter width
@ and which d/q registers loop_filter may use as temporaries.
.macro loop_filter_4
        loop_filter     4,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15
.endm

.macro loop_filter_8
        loop_filter     8,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15
.endm

.macro loop_filter_16
        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15, q4,  q5,  q6,  q7
.endm


@ The public functions in this file have got the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

@ r0 = dst, r1 = stride, r2 = mb_lim (E), r3 = lim (I), [sp] = hev_thr (H).
@ Loads the four 8-pixel rows above dst (p3..p0) and the four rows starting
@ at dst (q0..q3), applies the wd=4 filter and stores back p1, p0, q0, q1.
@ Every load/store carries a :64 alignment hint, so dst and stride must keep
@ each row 8-byte aligned.
function ff_vp9_loop_filter_v_4_8_neon, export=1
        sub             r12, r0,  r1, lsl #2        @ r12 = dst - 4 * stride
        vld1.8          {d20}, [r12,:64], r1        @ p3
        vld1.8          {d24}, [r0, :64], r1        @ q0
        vld1.8          {d21}, [r12,:64], r1        @ p2
        vld1.8          {d25}, [r0, :64], r1        @ q1
        vld1.8          {d22}, [r12,:64], r1        @ p1
        vld1.8          {d26}, [r0, :64], r1        @ q2
        vld1.8          {d23}, [r12,:64], r1        @ p0
        vld1.8          {d27}, [r0, :64], r1        @ q3
        sub             r0,  r0,  r1, lsl #2        @ back to the q0 row
        sub             r12, r12, r1, lsl #1        @ back to the p1 row

        loop_filter_4

        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
9:
        bx              lr
endfunc
@ r0 = dst, r1 = stride, r2 = mb_lim (E), r3 = lim (I), [sp] = hev_thr (H).
@ wd=4 filter across the edge between dst[-1] and dst[0] for 8 rows.
@ Loads 8 bytes starting at dst - 4 from each of 8 rows, transposes them so
@ that each d register holds one pixel position, filters, then transposes the
@ middle four positions back and stores 4 bytes per row starting at dst - 2.
function ff_vp9_loop_filter_h_4_8_neon, export=1
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0,  r0,  #2

        @ Transpose the 8x8 pixels, taking advantage of q registers, to get
        @ one register per column.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_4

        @ We only will write the mid 4 pixels back; after the loop filter,
        @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
        @ (8x4 pixels). We need to transpose them to columns, done with a
        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
        @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
        transpose_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
9:
        bx              lr
endfunc

@ r0 = dst, r1 = stride; r2, r3 and the stack argument each carry two packed
@ 8-bit thresholds as consumed by loop_filter_q.
@ 16-pixel-wide version of the wd=4 filter across the edge above dst.
@ q4-q7 are saved because loop_filter_q uses them as scratch; the 64 bytes
@ pushed here are what loop_filter_q's [sp, #64] offset accounts for.
@ The :128 hints require every row address to be 16-byte aligned.
function ff_vp9_loop_filter_v_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0,  r1, lsl #2
        vld1.8          {q8},  [r12,:128], r1       @ p3
        vld1.8          {q12}, [r0, :128], r1       @ q0
        vld1.8          {q9},  [r12,:128], r1       @ p2
        vld1.8          {q13}, [r0, :128], r1       @ q1
        vld1.8          {q10}, [r12,:128], r1       @ p1
        vld1.8          {q14}, [r0, :128], r1       @ q2
        vld1.8          {q11}, [r12,:128], r1       @ p0
        vld1.8          {q15}, [r0, :128], r1       @ q3
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_q

        vst1.8          {q10}, [r12,:128], r1
        vst1.8          {q12}, [r0, :128], r1
        vst1.8          {q11}, [r12,:128], r1
        vst1.8          {q13}, [r0, :128], r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

@ r0 = dst, r1 = stride; thresholds packed as for loop_filter_q.
@ wd=4 filter across the edge between dst[-1] and dst[0] for 16 rows.
@ Rows 0-7 are loaded into the low d half of q8-q15 and rows 8-15 into the
@ high half, so after the transpose each q register holds one pixel position
@ for all 16 rows.
function ff_vp9_loop_filter_h_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d16}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d18}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d20}, [r12], r1
        vld1.8          {d28}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d30}, [r0],  r1
        mov             r12, r0
        add             r0,  r0,  r1, lsl #2
        vld1.8          {d17}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d19}, [r12], r1
        vld1.8          {d27}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d29}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d31}, [r0],  r1

        @ Transpose the 16x8 pixels, as two 8x8 parts
        transpose_8x8   q8,  q9,  q10, q11, q12, q13, q14, q15

        loop_filter_q

        @ r0 ended 16 rows past the first loaded row; rewind r12 to row 0
        @ and point r0 at row 8.
        sub             r12, r0,  r1, lsl #4
        add             r0,  r12, r1, lsl #3
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0,  r0,  #2

        @ We only will write the mid 4 pixels back; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
        @ We need to transpose them to columns, done with a 4x4 transpose
        @ (which in practice is four 4x4 transposes of the 4x4 blocks of
        @ the 16x4 pixels; into 4x16 pixels).
        transpose_4x4   q10, q11, q12, q13

        vst1.32         {d20[0]}, [r12], r1
        vst1.32         {d21[0]}, [r0],  r1
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d23[0]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d25[0]}, [r0],  r1
        vst1.32         {d26[0]}, [r12], r1
        vst1.32         {d27[0]}, [r0],  r1
        vst1.32         {d20[1]}, [r12], r1
        vst1.32         {d21[1]}, [r0],  r1
        vst1.32         {d22[1]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[1]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        vst1.32         {d26[1]}, [r12], r1
        vst1.32         {d27[1]}, [r0],  r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

@ r0 = dst, r1 = stride, r2 = mb_lim (E), r3 = lim (I), [sp] = hev_thr (H).
@ wd=8 filter across the edge above dst, 8 pixels wide.
@ Normal exit stores p2..q2 (6 rows). Label 6 is the loop_filter target taken
@ when no pixel needed flat8in; it stores only p1..q1 (4 rows).
function ff_vp9_loop_filter_v_8_8_neon, export=1
        sub             r12, r0,  r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1        @ p3
        vld1.8          {d24}, [r0, :64], r1        @ q0
        vld1.8          {d21}, [r12,:64], r1        @ p2
        vld1.8          {d25}, [r0, :64], r1        @ q1
        vld1.8          {d22}, [r12,:64], r1        @ p1
        vld1.8          {d26}, [r0, :64], r1        @ q2
        vld1.8          {d23}, [r12,:64], r1        @ p0
        vld1.8          {d27}, [r0, :64], r1        @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, r1                @ r12 = p2 row

        loop_filter_8

        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
9:
        bx              lr
6:
        sub             r12, r0,  r1, lsl #1        @ r12 = p1 row
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        bx              lr
endfunc

function ff_vp9_loop_filter_h_8_8_neon, export=1
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2

        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_8

        @ Even though only
6 pixels per row have been changed, we write the 702cabdff1aSopenharmony_ci @ full 8 pixel registers. 703cabdff1aSopenharmony_ci transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 704cabdff1aSopenharmony_ci 705cabdff1aSopenharmony_ci vst1.8 {d20}, [r12], r1 706cabdff1aSopenharmony_ci vst1.8 {d24}, [r0], r1 707cabdff1aSopenharmony_ci vst1.8 {d21}, [r12], r1 708cabdff1aSopenharmony_ci vst1.8 {d25}, [r0], r1 709cabdff1aSopenharmony_ci vst1.8 {d22}, [r12], r1 710cabdff1aSopenharmony_ci vst1.8 {d26}, [r0], r1 711cabdff1aSopenharmony_ci vst1.8 {d23}, [r12], r1 712cabdff1aSopenharmony_ci vst1.8 {d27}, [r0], r1 713cabdff1aSopenharmony_ci9: 714cabdff1aSopenharmony_ci bx lr 715cabdff1aSopenharmony_ci6: 716cabdff1aSopenharmony_ci @ If we didn't need to do the flat8in part, we use the same writeback 717cabdff1aSopenharmony_ci @ as in loop_filter_h_4_8. 718cabdff1aSopenharmony_ci add r12, r12, #2 719cabdff1aSopenharmony_ci add r0, r0, #2 720cabdff1aSopenharmony_ci transpose_q_4x4 q11, q12, d22, d23, d24, d25 721cabdff1aSopenharmony_ci vst1.32 {d22[0]}, [r12], r1 722cabdff1aSopenharmony_ci vst1.32 {d22[1]}, [r0], r1 723cabdff1aSopenharmony_ci vst1.32 {d23[0]}, [r12], r1 724cabdff1aSopenharmony_ci vst1.32 {d23[1]}, [r0], r1 725cabdff1aSopenharmony_ci vst1.32 {d24[0]}, [r12], r1 726cabdff1aSopenharmony_ci vst1.32 {d24[1]}, [r0], r1 727cabdff1aSopenharmony_ci vst1.32 {d25[0]}, [r12], r1 728cabdff1aSopenharmony_ci vst1.32 {d25[1]}, [r0], r1 729cabdff1aSopenharmony_ci bx lr 730cabdff1aSopenharmony_ciendfunc 731cabdff1aSopenharmony_ci 732cabdff1aSopenharmony_cifunction vp9_loop_filter_v_16_neon 733cabdff1aSopenharmony_ci sub r12, r0, r1, lsl #3 734cabdff1aSopenharmony_ci @ Read p7-p0 using r12 and q0-q7 using r0 735cabdff1aSopenharmony_ci vld1.8 {d16}, [r12,:64], r1 @ p7 736cabdff1aSopenharmony_ci vld1.8 {d24}, [r0, :64], r1 @ q0 737cabdff1aSopenharmony_ci vld1.8 {d17}, [r12,:64], r1 @ p6 738cabdff1aSopenharmony_ci vld1.8 {d25}, [r0, :64], r1 @ q1 
739cabdff1aSopenharmony_ci vld1.8 {d18}, [r12,:64], r1 @ p5 740cabdff1aSopenharmony_ci vld1.8 {d26}, [r0, :64], r1 @ q2 741cabdff1aSopenharmony_ci vld1.8 {d19}, [r12,:64], r1 @ p4 742cabdff1aSopenharmony_ci vld1.8 {d27}, [r0, :64], r1 @ q3 743cabdff1aSopenharmony_ci vld1.8 {d20}, [r12,:64], r1 @ p3 744cabdff1aSopenharmony_ci vld1.8 {d28}, [r0, :64], r1 @ q4 745cabdff1aSopenharmony_ci vld1.8 {d21}, [r12,:64], r1 @ p2 746cabdff1aSopenharmony_ci vld1.8 {d29}, [r0, :64], r1 @ q5 747cabdff1aSopenharmony_ci vld1.8 {d22}, [r12,:64], r1 @ p1 748cabdff1aSopenharmony_ci vld1.8 {d30}, [r0, :64], r1 @ q6 749cabdff1aSopenharmony_ci vld1.8 {d23}, [r12,:64], r1 @ p0 750cabdff1aSopenharmony_ci vld1.8 {d31}, [r0, :64], r1 @ q7 751cabdff1aSopenharmony_ci sub r12, r12, r1, lsl #3 752cabdff1aSopenharmony_ci sub r0, r0, r1, lsl #3 753cabdff1aSopenharmony_ci add r12, r12, r1 754cabdff1aSopenharmony_ci 755cabdff1aSopenharmony_ci loop_filter_16 756cabdff1aSopenharmony_ci 757cabdff1aSopenharmony_ci @ If we did the flat8out part, we get the output in 758cabdff1aSopenharmony_ci @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride, 759cabdff1aSopenharmony_ci @ store d2-d9 there, and d10-d17 into r0. 
760cabdff1aSopenharmony_ci vst1.8 {d2}, [r12,:64], r1 761cabdff1aSopenharmony_ci vst1.8 {d10}, [r0, :64], r1 762cabdff1aSopenharmony_ci vst1.8 {d3}, [r12,:64], r1 763cabdff1aSopenharmony_ci vst1.8 {d11}, [r0, :64], r1 764cabdff1aSopenharmony_ci vst1.8 {d4}, [r12,:64], r1 765cabdff1aSopenharmony_ci vst1.8 {d12}, [r0, :64], r1 766cabdff1aSopenharmony_ci vst1.8 {d5}, [r12,:64], r1 767cabdff1aSopenharmony_ci vst1.8 {d13}, [r0, :64], r1 768cabdff1aSopenharmony_ci vst1.8 {d6}, [r12,:64], r1 769cabdff1aSopenharmony_ci vst1.8 {d14}, [r0, :64], r1 770cabdff1aSopenharmony_ci vst1.8 {d8}, [r12,:64], r1 771cabdff1aSopenharmony_ci vst1.8 {d15}, [r0, :64], r1 772cabdff1aSopenharmony_ci vst1.8 {d9}, [r12,:64], r1 773cabdff1aSopenharmony_ci vst1.8 {d17}, [r0, :64], r1 774cabdff1aSopenharmony_ci sub r0, r0, r1, lsl #3 775cabdff1aSopenharmony_ci add r0, r0, r1 776cabdff1aSopenharmony_ci 777cabdff1aSopenharmony_ci9: 778cabdff1aSopenharmony_ci bx lr 779cabdff1aSopenharmony_ci 780cabdff1aSopenharmony_ci8: 781cabdff1aSopenharmony_ci add r12, r12, r1, lsl #2 782cabdff1aSopenharmony_ci @ If we didn't do the flat8out part, the output is left in the 783cabdff1aSopenharmony_ci @ input registers. 
784cabdff1aSopenharmony_ci vst1.8 {d21}, [r12,:64], r1 785cabdff1aSopenharmony_ci vst1.8 {d24}, [r0, :64], r1 786cabdff1aSopenharmony_ci vst1.8 {d22}, [r12,:64], r1 787cabdff1aSopenharmony_ci vst1.8 {d25}, [r0, :64], r1 788cabdff1aSopenharmony_ci vst1.8 {d23}, [r12,:64], r1 789cabdff1aSopenharmony_ci vst1.8 {d26}, [r0, :64], r1 790cabdff1aSopenharmony_ci sub r0, r0, r1, lsl #1 791cabdff1aSopenharmony_ci sub r0, r0, r1 792cabdff1aSopenharmony_ci bx lr 793cabdff1aSopenharmony_ci7: 794cabdff1aSopenharmony_ci sub r12, r0, r1, lsl #1 795cabdff1aSopenharmony_ci vst1.8 {d22}, [r12,:64], r1 796cabdff1aSopenharmony_ci vst1.8 {d24}, [r0, :64], r1 797cabdff1aSopenharmony_ci vst1.8 {d23}, [r12,:64], r1 798cabdff1aSopenharmony_ci vst1.8 {d25}, [r0, :64], r1 799cabdff1aSopenharmony_ci sub r0, r0, r1, lsl #1 800cabdff1aSopenharmony_ci bx lr 801cabdff1aSopenharmony_ciendfunc 802cabdff1aSopenharmony_ci 803cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_16_8_neon, export=1 804cabdff1aSopenharmony_ci ldr r12, [sp] 805cabdff1aSopenharmony_ci push {lr} 806cabdff1aSopenharmony_ci vpush {q4-q7} 807cabdff1aSopenharmony_ci push {r12} 808cabdff1aSopenharmony_ci bl vp9_loop_filter_v_16_neon 809cabdff1aSopenharmony_ci add sp, sp, #4 810cabdff1aSopenharmony_ci vpop {q4-q7} 811cabdff1aSopenharmony_ci pop {pc} 812cabdff1aSopenharmony_ciendfunc 813cabdff1aSopenharmony_ci 814cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_16_16_neon, export=1 815cabdff1aSopenharmony_ci ldr r12, [sp] 816cabdff1aSopenharmony_ci // The filter clobbers r2 and r3, but we need to keep them for the second round 817cabdff1aSopenharmony_ci push {r2, r3, lr} 818cabdff1aSopenharmony_ci vpush {q4-q7} 819cabdff1aSopenharmony_ci push {r12} 820cabdff1aSopenharmony_ci bl vp9_loop_filter_v_16_neon 821cabdff1aSopenharmony_ci add r0, #8 822cabdff1aSopenharmony_ci ldr r2, [sp, #68] 823cabdff1aSopenharmony_ci ldr r3, [sp, #72] 824cabdff1aSopenharmony_ci bl vp9_loop_filter_v_16_neon 825cabdff1aSopenharmony_ci add sp, sp, #4 
826cabdff1aSopenharmony_ci vpop {q4-q7} 827cabdff1aSopenharmony_ci pop {r2, r3, pc} 828cabdff1aSopenharmony_ciendfunc 829cabdff1aSopenharmony_ci 830cabdff1aSopenharmony_cifunction vp9_loop_filter_h_16_neon 831cabdff1aSopenharmony_ci sub r12, r0, #8 832cabdff1aSopenharmony_ci vld1.8 {d16}, [r12,:64], r1 833cabdff1aSopenharmony_ci vld1.8 {d24}, [r0, :64], r1 834cabdff1aSopenharmony_ci vld1.8 {d17}, [r12,:64], r1 835cabdff1aSopenharmony_ci vld1.8 {d25}, [r0, :64], r1 836cabdff1aSopenharmony_ci vld1.8 {d18}, [r12,:64], r1 837cabdff1aSopenharmony_ci vld1.8 {d26}, [r0, :64], r1 838cabdff1aSopenharmony_ci vld1.8 {d19}, [r12,:64], r1 839cabdff1aSopenharmony_ci vld1.8 {d27}, [r0, :64], r1 840cabdff1aSopenharmony_ci vld1.8 {d20}, [r12,:64], r1 841cabdff1aSopenharmony_ci vld1.8 {d28}, [r0, :64], r1 842cabdff1aSopenharmony_ci vld1.8 {d21}, [r12,:64], r1 843cabdff1aSopenharmony_ci vld1.8 {d29}, [r0, :64], r1 844cabdff1aSopenharmony_ci vld1.8 {d22}, [r12,:64], r1 845cabdff1aSopenharmony_ci vld1.8 {d30}, [r0, :64], r1 846cabdff1aSopenharmony_ci vld1.8 {d23}, [r12,:64], r1 847cabdff1aSopenharmony_ci vld1.8 {d31}, [r0, :64], r1 848cabdff1aSopenharmony_ci sub r0, r0, r1, lsl #3 849cabdff1aSopenharmony_ci sub r12, r12, r1, lsl #3 850cabdff1aSopenharmony_ci 851cabdff1aSopenharmony_ci @ The 16x8 pixels read above is in two 8x8 blocks; the left 852cabdff1aSopenharmony_ci @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes 853cabdff1aSopenharmony_ci @ of this, to get one column per register. This could be done with two 854cabdff1aSopenharmony_ci @ transpose_8x8 as below, but this takes advantage of the q registers. 
855cabdff1aSopenharmony_ci transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15 856cabdff1aSopenharmony_ci vtrn.8 d16, d17 857cabdff1aSopenharmony_ci vtrn.8 d18, d19 858cabdff1aSopenharmony_ci vtrn.8 d20, d21 859cabdff1aSopenharmony_ci vtrn.8 d22, d23 860cabdff1aSopenharmony_ci vtrn.8 d24, d25 861cabdff1aSopenharmony_ci vtrn.8 d26, d27 862cabdff1aSopenharmony_ci vtrn.8 d28, d29 863cabdff1aSopenharmony_ci vtrn.8 d30, d31 864cabdff1aSopenharmony_ci 865cabdff1aSopenharmony_ci loop_filter_16 866cabdff1aSopenharmony_ci 867cabdff1aSopenharmony_ci @ Transpose back; this is the same transpose as above, but 868cabdff1aSopenharmony_ci @ we can't take advantage of q registers for the transpose, since 869cabdff1aSopenharmony_ci @ all d registers in the transpose aren't consecutive. 870cabdff1aSopenharmony_ci transpose_8x8 d16, d2, d3, d4, d5, d6, d8, d9 871cabdff1aSopenharmony_ci transpose_8x8 d10, d11, d12, d13, d14, d15, d17, d31 872cabdff1aSopenharmony_ci 873cabdff1aSopenharmony_ci vst1.8 {d16}, [r12,:64], r1 874cabdff1aSopenharmony_ci vst1.8 {d10}, [r0, :64], r1 875cabdff1aSopenharmony_ci 876cabdff1aSopenharmony_ci vst1.8 {d2}, [r12,:64], r1 877cabdff1aSopenharmony_ci vst1.8 {d11}, [r0, :64], r1 878cabdff1aSopenharmony_ci 879cabdff1aSopenharmony_ci vst1.8 {d3}, [r12,:64], r1 880cabdff1aSopenharmony_ci vst1.8 {d12}, [r0, :64], r1 881cabdff1aSopenharmony_ci 882cabdff1aSopenharmony_ci vst1.8 {d4}, [r12,:64], r1 883cabdff1aSopenharmony_ci vst1.8 {d13}, [r0, :64], r1 884cabdff1aSopenharmony_ci 885cabdff1aSopenharmony_ci vst1.8 {d5}, [r12,:64], r1 886cabdff1aSopenharmony_ci vst1.8 {d14}, [r0, :64], r1 887cabdff1aSopenharmony_ci 888cabdff1aSopenharmony_ci vst1.8 {d6}, [r12,:64], r1 889cabdff1aSopenharmony_ci vst1.8 {d15}, [r0, :64], r1 890cabdff1aSopenharmony_ci 891cabdff1aSopenharmony_ci vst1.8 {d8}, [r12,:64], r1 892cabdff1aSopenharmony_ci vst1.8 {d17}, [r0, :64], r1 893cabdff1aSopenharmony_ci 894cabdff1aSopenharmony_ci vst1.8 {d9}, [r12,:64], r1 895cabdff1aSopenharmony_ci 
vst1.8 {d31}, [r0, :64], r1 896cabdff1aSopenharmony_ci sub r0, r0, r1, lsl #3 897cabdff1aSopenharmony_ci9: 898cabdff1aSopenharmony_ci bx lr 899cabdff1aSopenharmony_ci8: 900cabdff1aSopenharmony_ci @ The same writeback as in loop_filter_h_8_8 901cabdff1aSopenharmony_ci sub r12, r0, #4 902cabdff1aSopenharmony_ci add r0, r12, r1, lsl #2 903cabdff1aSopenharmony_ci transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 904cabdff1aSopenharmony_ci 905cabdff1aSopenharmony_ci vst1.8 {d20}, [r12], r1 906cabdff1aSopenharmony_ci vst1.8 {d24}, [r0], r1 907cabdff1aSopenharmony_ci vst1.8 {d21}, [r12], r1 908cabdff1aSopenharmony_ci vst1.8 {d25}, [r0], r1 909cabdff1aSopenharmony_ci vst1.8 {d22}, [r12], r1 910cabdff1aSopenharmony_ci vst1.8 {d26}, [r0], r1 911cabdff1aSopenharmony_ci vst1.8 {d23}, [r12], r1 912cabdff1aSopenharmony_ci vst1.8 {d27}, [r0], r1 913cabdff1aSopenharmony_ci sub r0, r0, r1, lsl #3 914cabdff1aSopenharmony_ci add r0, r0, #4 915cabdff1aSopenharmony_ci bx lr 916cabdff1aSopenharmony_ci7: 917cabdff1aSopenharmony_ci @ The same writeback as in loop_filter_h_4_8 918cabdff1aSopenharmony_ci sub r12, r0, #2 919cabdff1aSopenharmony_ci add r0, r12, r1, lsl #2 920cabdff1aSopenharmony_ci transpose_q_4x4 q11, q12, d22, d23, d24, d25 921cabdff1aSopenharmony_ci vst1.32 {d22[0]}, [r12], r1 922cabdff1aSopenharmony_ci vst1.32 {d22[1]}, [r0], r1 923cabdff1aSopenharmony_ci vst1.32 {d23[0]}, [r12], r1 924cabdff1aSopenharmony_ci vst1.32 {d23[1]}, [r0], r1 925cabdff1aSopenharmony_ci vst1.32 {d24[0]}, [r12], r1 926cabdff1aSopenharmony_ci vst1.32 {d24[1]}, [r0], r1 927cabdff1aSopenharmony_ci vst1.32 {d25[0]}, [r12], r1 928cabdff1aSopenharmony_ci vst1.32 {d25[1]}, [r0], r1 929cabdff1aSopenharmony_ci sub r0, r0, r1, lsl #3 930cabdff1aSopenharmony_ci add r0, r0, #2 931cabdff1aSopenharmony_ci bx lr 932cabdff1aSopenharmony_ciendfunc 933cabdff1aSopenharmony_ci 934cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_16_8_neon, export=1 935cabdff1aSopenharmony_ci ldr r12, [sp] 
936cabdff1aSopenharmony_ci push {lr} 937cabdff1aSopenharmony_ci vpush {q4-q7} 938cabdff1aSopenharmony_ci push {r12} 939cabdff1aSopenharmony_ci bl vp9_loop_filter_h_16_neon 940cabdff1aSopenharmony_ci add sp, sp, #4 941cabdff1aSopenharmony_ci vpop {q4-q7} 942cabdff1aSopenharmony_ci pop {pc} 943cabdff1aSopenharmony_ciendfunc 944cabdff1aSopenharmony_ci 945cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_16_16_neon, export=1 946cabdff1aSopenharmony_ci ldr r12, [sp] 947cabdff1aSopenharmony_ci // The filter clobbers r2 and r3, but we need to keep them for the second round 948cabdff1aSopenharmony_ci push {r2, r3, lr} 949cabdff1aSopenharmony_ci vpush {q4-q7} 950cabdff1aSopenharmony_ci push {r12} 951cabdff1aSopenharmony_ci bl vp9_loop_filter_h_16_neon 952cabdff1aSopenharmony_ci add r0, r0, r1, lsl #3 953cabdff1aSopenharmony_ci ldr r2, [sp, #68] 954cabdff1aSopenharmony_ci ldr r3, [sp, #72] 955cabdff1aSopenharmony_ci bl vp9_loop_filter_h_16_neon 956cabdff1aSopenharmony_ci add sp, sp, #4 957cabdff1aSopenharmony_ci vpop {q4-q7} 958cabdff1aSopenharmony_ci pop {r2, r3, pc} 959cabdff1aSopenharmony_ciendfunc 960