/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"


// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
// loop_filter wd, tmp1 .. tmp8
//
// Core of the loop filter for one run of 8 pixels, one 16 bit lane per pixel.
//
// Macro arguments:
//   wd          filter width (4, 8 or 16); selects which stages are assembled
//   tmp1..tmp7  vector registers used as scratch
//   tmp8        accepted, but not referenced anywhere in the macro body
//
// Register inputs:
//   v20-v27     p3 .. q3 (p0 = v23, q0 = v24)
//   v16-v19, v28-v31  p7 .. p4 and q4 .. q7 (read for wd == 16 only)
//   w2, w3, w4  E, I, H thresholds
//   w5          flatness threshold (read for wd >= 8 only)
//   w6          left shift used to saturate to BIT_DEPTH - 1 bits
//   w7          max pixel value
//   x10         address returned to when no pixel needs filtering
//   x13         wd == 8:  address returned to when no pixel needs flat8in
//   x14         wd == 16: address returned to when neither flat8in nor
//               flat8out is needed
//   x15         wd == 16: address returned to when flat8out is not needed
//
// Outputs:
//   v22-v25 are updated in place by the inner filter and v21-v26 by flat8in.
//   For wd == 16 the flat8out results are left in v2-v17 (skipping v7 and v16).
//
// Clobbers x11, x12, the condition flags, v0-v7 and the tmp registers.
// For wd == 16, v8-v15 are used directly regardless of the tmp arguments,
// and the flat8out stage overwrites v18, v20 and v22 as temporaries.
//
// The instruction order is interleaved by hand; the macro falls through at
// its end, so the invoking code has to supply the final return.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup             v0.8h,  w2                   // E
        dup             v2.8h,  w3                   // I
        dup             v3.8h,  w4                   // H

        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
        umax            v4.8h,  v4.8h,  v5.8h
        umax            v5.8h,  v6.8h,  v7.8h
        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
        umax            v4.8h,  v4.8h,  v5.8h
        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5.8h,  v5.8h,  #1
        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        cmhs            v6.8h,  v0.8h,  v6.8h
        and             v4.16b, v4.16b, v6.16b       // fm

        // If no pixels need filtering, just exit as soon as possible
        mov             x11, v4.d[0]
        mov             x12, v4.d[1]
        adds            x11, x11, x12
        b.ne            1f
        ret             x10
1:

.if \wd >= 8
        dup             v0.8h,  w5                   // flatness threshold

        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
        umax            v6.8h,  v6.8h,  v2.8h
        umax            v1.8h,  v1.8h,  \tmp1\().8h
        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
.if \wd == 16
        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
        umax            v6.8h,  v6.8h,  v1.8h
        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
        umax            v6.8h,  v6.8h,  \tmp2\().8h
        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)

        umax            v7.8h,  v7.8h,  v2.8h
        umax            v1.8h,  v1.8h,  v8.8h
        umax            v9.8h,  v9.8h,  v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5.8h,  v22.8h, v23.8h       // abs(p1 - p0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v1.8h
        umax            v9.8h,  v9.8h,  v11.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  v1.8h
.endif
        uabd            v1.8h,  v25.8h, v24.8h       // abs(q1 - q0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v9.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  \tmp2\().8h
.endif
        dup             \tmp2\().8h,  w6             // left shift for saturation
        sub             \tmp1\().8h,  v22.8h, v25.8h // p1 - q1
        neg             \tmp6\().8h,  \tmp2\().8h    // negative left shift after saturation
        umax            v5.8h,  v5.8h,  v1.8h        // max(abs(p1 - p0), abs(q1 - q0))
        sub             \tmp3\().8h,  v24.8h, v23.8h // q0 - p0
        movi            \tmp5\().8h,  #3
.if \wd == 8
        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
.endif
        cmhs            v5.8h,  v3.8h,  v5.8h        // !hev
.if \wd == 8
        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
.endif
        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
.if \wd == 16
        cmhs            v7.8h,  v0.8h,  v7.8h        // flat8out
.elseif \wd == 8
        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
.endif
        and             v5.16b, v5.16b, v4.16b       // !hev && fm && !flat8in
.if \wd == 16
        and             v7.16b, v7.16b, v6.16b       // flat8out && flat8in && fm
.endif
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_int2p = 0
        movi            v2.8h,  #4
        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        movi            v3.8h,  #3
        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
        movi            \tmp5\().8h,  #0
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup             \tmp6\().8h,  w7             // max pixel value
.if \wd == 16
        bic             v6.16b, v6.16b, v7.16b       // fm && flat8in && !flat8out
.endif

        ushr            \tmp2\().8h,  \tmp6\().8h,  #1 // (1 << (BIT_DEPTH - 1)) - 1

        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2

        add             v0.8h,  v23.8h, \tmp4\().8h  // p0 + f2
        sub             v2.8h,  v24.8h, \tmp3\().8h  // q0 - f1
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
        srshr           \tmp3\().8h,  \tmp3\().8h,  #1 // f = (f1 + 1) >> 1
        smax            v0.8h,  v0.8h,  \tmp5\().8h  // out p0
        smax            v2.8h,  v2.8h,  \tmp5\().8h  // out q0
        bit             v23.16b, v0.16b, v4.16b      // if (fm && !flat8in)
        bit             v24.16b, v2.16b, v4.16b

        add             v0.8h,  v22.8h, \tmp3\().8h  // p1 + f
        sub             v2.8h,  v25.8h, \tmp3\().8h  // q1 - f
.if \wd >= 8
        mov             x11, v6.d[0]
.endif
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
.if \wd >= 8
        mov             x12, v6.d[1]
.endif
        smax            v0.8h,  v0.8h,  \tmp5\().8h  // out p1
        smax            v2.8h,  v2.8h,  \tmp5\().8h  // out q1
.if \wd >= 8
        adds            x11, x11, x12                // sets flags for the flat8in test below
.endif
        bit             v22.16b, v0.16b, v5.16b      // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b, v5.16b

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
        b.eq            6f
.else
        b.ne            1f
        ret             x13
1:
.endif

        // flat8in
        add             \tmp1\().8h,  v20.8h, v21.8h
        add             \tmp3\().8h,  v22.8h, v25.8h
        add             \tmp5\().8h,  v20.8h, v22.8h
        add             \tmp7\().8h,  v23.8h, v26.8h
        add             v0.8h,  \tmp1\().8h,  \tmp1\().8h
        add             v0.8h,  v0.8h,  v23.8h
        add             v0.8h,  v0.8h,  v24.8h
        add             v0.8h,  v0.8h,  \tmp5\().8h
        sub             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h
        sub             \tmp7\().8h,  \tmp7\().8h,  \tmp5\().8h
        urshr           v2.8h,  v0.8h,  #3           // out p2

        add             v0.8h,  v0.8h,  \tmp3\().8h
        add             \tmp1\().8h,  v20.8h, v23.8h
        add             \tmp3\().8h,  v24.8h, v27.8h
        urshr           v3.8h,  v0.8h,  #3           // out p1

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h
        add             \tmp5\().8h,  v21.8h, v24.8h
        add             \tmp7\().8h,  v25.8h, v27.8h
        urshr           v4.8h,  v0.8h,  #3           // out p0

        add             v0.8h,  v0.8h,  \tmp3\().8h
        sub             \tmp7\().8h,  \tmp7\().8h,  \tmp5\().8h
        add             \tmp1\().8h,  v22.8h, v25.8h
        add             \tmp3\().8h,  v26.8h, v27.8h
        urshr           v5.8h,  v0.8h,  #3           // out q0

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h
        urshr           \tmp5\().8h,  v0.8h,  #3     // out q1

        add             v0.8h,  v0.8h,  \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b, v6.16b
        bit             v22.16b, v3.16b, v6.16b
        bit             v23.16b, v4.16b, v6.16b
        urshr           \tmp6\().8h,  v0.8h,  #3     // out q2
        bit             v24.16b, v5.16b, v6.16b
        bit             v25.16b, \tmp5\().16b, v6.16b
        bit             v26.16b, \tmp6\().16b, v6.16b
.endif
.if \wd == 16
6:
        orr             v2.16b, v6.16b, v7.16b
        mov             x11, v2.d[0]
        mov             x12, v2.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        ret             x14
1:

        mov             x11, v7.d[0]
        mov             x12, v7.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        ret             x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16);
        // v7 holds the flat8out mask and v16 (p7) is read throughout.
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
        shl             v0.8h,  v16.8h, #3           // 8 * v16
        sub             v0.8h,  v0.8h,  v16.8h       // 7 * v16
        add             v0.8h,  v0.8h,  v17.8h
        add             v8.8h,  v17.8h, v18.8h
        add             v10.8h, v19.8h, v20.8h
        add             v0.8h,  v0.8h,  v8.8h
        add             v8.8h,  v16.8h, v17.8h
        add             v12.8h, v21.8h, v22.8h
        add             v0.8h,  v0.8h,  v10.8h
        add             v10.8h, v18.8h, v25.8h
        add             v14.8h, v23.8h, v24.8h
        sub             v10.8h, v10.8h, v8.8h
        add             v0.8h,  v0.8h,  v12.8h
        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v18.8h
        add             v14.8h, v19.8h, v26.8h
        urshr           v2.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v8.8h,  v16.8h, v19.8h
        add             v10.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v2.16b, v17.16b, v7.16b
        urshr           v3.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v20.8h
        add             v14.8h, v21.8h, v28.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v3.16b, v18.16b, v7.16b
        urshr           v4.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v8.8h,  v16.8h, v21.8h
        add             v10.8h, v22.8h, v29.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v4.16b, v19.16b, v7.16b
        urshr           v5.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v22.8h
        add             v14.8h, v23.8h, v30.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v5.16b, v20.16b, v7.16b
        urshr           v6.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v10.8h, v16.8h, v23.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v24.8h, v31.8h
        bif             v6.16b, v21.16b, v7.16b
        urshr           v8.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        sub             v10.8h, v12.8h, v10.8h
        add             v12.8h, v17.8h, v24.8h
        add             v14.8h, v25.8h, v31.8h
        bif             v8.16b, v22.16b, v7.16b
        urshr           v9.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v26.8h, v31.8h
        bif             v9.16b, v23.16b, v7.16b
        urshr           v10.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v14.8h, v18.8h, v25.8h
        add             v18.8h, v19.8h, v26.8h
        sub             v12.8h, v12.8h, v14.8h
        add             v14.8h, v27.8h, v31.8h
        bif             v10.16b, v24.16b, v7.16b
        urshr           v11.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v12.8h
        add             v12.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v18.8h
        add             v18.8h, v28.8h, v31.8h
        bif             v11.16b, v25.16b, v7.16b
        sub             v18.8h, v18.8h, v12.8h
        urshr           v12.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v14.8h, v21.8h, v28.8h
        add             v20.8h, v29.8h, v31.8h
        bif             v12.16b, v26.16b, v7.16b
        urshr           v13.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v18.8h
        sub             v20.8h, v20.8h, v14.8h
        add             v18.8h, v22.8h, v29.8h
        add             v22.8h, v30.8h, v31.8h
        bif             v13.16b, v27.16b, v7.16b
        urshr           v14.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v20.8h
        sub             v22.8h, v22.8h, v18.8h
        bif             v14.16b, v28.16b, v7.16b
        urshr           v15.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v22.8h
        bif             v15.16b, v29.16b, v7.16b
        urshr           v17.8h, v0.8h,  #4
        bif             v17.16b, v30.16b, v7.16b
.endif
.endm

// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
// while we need those for inputs/outputs in wd=16 and use v8-v15
// for temp registers there instead.
// Out-of-line instances of the loop_filter macro, one per filter width.
// They are entered with bl from the loop_filter_4/8/16 macros below; the
// trailing ret returns through x30 when the full filter ran, while the early
// exits inside the macro return through x10, x13, x14 or x15 instead.
function vp9_loop_filter_4
        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_8
        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_16
        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
        ret
endfunc

// Call wrappers. The invoking function must already hold its own return
// address in x10, since bl overwrites x30.
.macro loop_filter_4
        bl              vp9_loop_filter_4
.endm

// Requires a local label 6: (writeout of the inner 4 pixels) after the
// point of use.
.macro loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
.endm

// Requires local labels 7: and 8: after the point of use (writeout of the
// inner 4 and the inner 6 pixels respectively).
.macro loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
.endm


// The public functions in this file have got the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

// bpp_frontend func, bpp, push
//
// Emits the exported ff_<func>_<bpp>_neon, which scales the 8 bit thresholds
// in w2-w4 up to \bpp bits, sets up the bit depth dependent constants
//   x5 = 1 << (bpp - 8)      flatness threshold
//   x6 = 16 - bpp            left shift used for saturation
//   x7 = (1 << bpp) - 1      max pixel value
// and hands over to the internal <func>_16_neon.
// With push != 0, d8-d15 are saved around the call and the return address is
// kept in x16; otherwise the internal function is tail-called.
.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
        mov             x16, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
.if \push
        bl              \func\()_16_neon
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        ret             x16
.else
        b               \func\()_16_neon
.endif
endfunc
.endm

// Instantiates bpp_frontend for 10 and 12 bits per pixel.
.macro bpp_frontends func, push=0
        bpp_frontend    \func, 10, \push
        bpp_frontend    \func, 12, \push
.endm

// bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
//
// Emits the exported ff_<func>_<suffix>_<bpp>_neon, which sets up the same
// constants as bpp_frontend and then calls <func>_<int_suffix>_16_neon twice.
// Between the two calls x0 is advanced by 8 rows (x1 << 3) when dir is h,
// and by 16 bytes (8 pixels of 16 bit) otherwise.
// The return address is always kept in x16; d8-d15 are saved only when
// push != 0.
.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        mov             x16, x30
.if \push
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             x0,  x0,  x1, lsl #3
.else
        add             x0,  x0,  #16
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \push
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        ret             x16
endfunc
.endm

// Instantiates bpp_frontend_rep for 10 and 12 bits per pixel.
.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm

// bpp_frontend_mix2 wd1, wd2, dir, bpp
//
// Emits the exported ff_vp9_loop_filter_<dir>_<wd1><wd2>_16_<bpp>_neon, which
// filters two adjacent 8 pixel runs with widths wd1 and wd2.
// Each of w2, w3 and w4 carries two thresholds: the low 8 bits are used for
// the first run and the value shifted right by 8 for the second run.
// The second set is parked in w8, w14 and w15 across the first call.
// x0 is advanced between the calls in the same way as in bpp_frontend_rep.
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        mov             x16, x30
        lsr             w8,  w2,  #8
        lsr             w14, w3,  #8
        lsr             w15, w4,  #8
        and             w2,  w2,  #0xff
        and             w3,  w3,  #0xff
        and             w4,  w4,  #0xff
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             x0,  x0,  x1, lsl #3
.else
        add             x0,  x0,  #16
.endif
        lsl             w2,  w8,  #\bpp - 8
        lsl             w3,  w14, #\bpp - 8
        lsl             w4,  w15, #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        ret             x16
endfunc
.endm

// Instantiates bpp_frontend_mix2 for both directions and for 10 and 12 bits
// per pixel.
.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

// Width 4 filter across a horizontal edge, 8 pixels wide.
// In:  x0 = pointer to the q0 row, x1 = stride in bytes,
//      w2-w7 as set up by the bpp frontends.
// Loads p3..q3 from the 4 rows above and the 4 rows starting at x0, and
// writes back p1, p0, q0 and q1.
// x0 is restored to its entry value on the normal exit path.
// Clobbers x9 and x10 in addition to what loop_filter clobbers.
function vp9_loop_filter_v_4_8_16_neon
        mov             x10, x30                     // bl below overwrites x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #2         // back to the q0 row
        sub             x9,  x9,  x1, lsl #1         // back to the p1 row

        loop_filter_4

        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1

        ret             x10
endfunc

bpp_frontends vp9_loop_filter_v_4_8

// Width 4 filter across a vertical edge, 8 rows high.
// In:  x0 = pointer to q0 of the first row, x1 = stride in bytes,
//      w2-w7 as set up by the bpp frontends.
// Loads 8 rows of 8 pixels starting 4 pixels (8 bytes) left of x0,
// transposes them so that v20-v27 hold p3..q3, filters, and writes the
// middle 4 pixels of every row back.
// x0 is restored to its entry value on the normal exit path.
// Clobbers x9, x10 and v28-v29 in addition to what loop_filter clobbers.
function vp9_loop_filter_h_4_8_16_neon
        mov             x10, x30                     // bl below overwrites x30
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2

        // We only will write the mid 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4

        ret             x10
endfunc

bpp_frontends vp9_loop_filter_h_4_8

// Width 8 filter across a horizontal edge, 8 pixels wide.
// In:  x0 = pointer to the q0 row, x1 = stride in bytes,
//      w2-w7 as set up by the bpp frontends.
// The normal path writes back p2..q2; label 6 is the alternative return
// target used when no pixel needs flat8in, and writes back only p1..q1.
// x0 is restored to its entry value on both paths.
// Clobbers x9, x10 and x13 in addition to what loop_filter clobbers.
function vp9_loop_filter_v_8_8_16_neon
        mov             x10, x30                     // bl below overwrites x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        add             x9,  x9,  x1                 // x9 = p2 row

        loop_filter_8

        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1

        ret             x10
6:
        sub             x9,  x0,  x1, lsl #1         // x9 = p1 row
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_v_8_8

// Width 8 filter across a vertical edge, 8 rows high.
// In:  x0 = pointer to q0 of the first row, x1 = stride in bytes,
//      w2-w7 as set up by the bpp frontends.
// The normal path transposes back and rewrites all 8 pixels of every row;
// label 6 is the alternative return target used when no pixel needs
// flat8in, and rewrites only the middle 4 pixels.
// x0 is restored to its entry value on both paths.
// Clobbers x9, x10, x13 and v28-v29 in addition to what loop_filter clobbers.
function vp9_loop_filter_h_8_8_16_neon
        mov             x10, x30                     // bl below overwrites x30
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        add             x0,  x9,  x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        ret             x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

// 16-pixel-wide loop filter across a horizontal edge, 8 pixels (16 bit
// each) wide: p7..p0 are loaded from the eight rows above x0 and q0..q7
// from the eight rows starting at x0.
//
// In:     x0     = address of the q0 row
//         x1     = distance between rows in bytes
//         w2-w4  = E, I, H as consumed by the loop filter macro
// Out:    x0 holds its entry value again on return
// Clobb:  x9, x10, v16-v31, and the registers loop_filter_16 produces its
//         output in (v2-v6, v8-v15, v17 are stored below)
//
// The return address is kept in x10 and every exit is "ret x10".
// NOTE(review): v8-v15 are callee-saved under AAPCS64; presumably the
// push=1 argument of the frontend macros below takes care of saving them
// -- confirm against bpp_frontends.
function vp9_loop_filter_v_16_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.8h}, [x9], x1 // p7
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v17.8h}, [x9], x1 // p6
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v18.8h}, [x9], x1 // p5
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v19.8h}, [x9], x1 // p4
        ld1             {v27.8h}, [x0], x1 // q3
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v28.8h}, [x0], x1 // q4
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v29.8h}, [x0], x1 // q5
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v30.8h}, [x0], x1 // q6
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v31.8h}, [x0], x1 // q7
        // Rewind x0 to the q0 row and leave x9 at the p6 row, the first
        // row that is written back on the full-width path.
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8h},  [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        // x0 advanced by 7 rows; restore it.
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  x1

        ret             x10
8:
        add             x9,  x9,  x1, lsl #2    // x9 = x0 - 3 * stride (p2 row)
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1
        ret             x10
7:
        // NOTE(review): presumably reached from loop_filter_16 when only
        // the inner 4 pixels were filtered -- confirm against the macro.
        // Only p1, p0, q0 and q1 are written back on this path.
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

// 16-pixel-wide loop filter across a vertical edge: 8 rows of 16 pixels
// (16 bit each) are loaded, transposed so that each register holds one
// pixel position (p7 = v16 .. p0 = v23, q0 = v24 .. q7 = v31), filtered,
// transposed back and stored.
//
// In:     x0     = address of q0 in the first row; the left half of each
//                  row is read from 16 bytes (8 pixels) before it
//         x1     = distance between rows in bytes
//         w2-w4  = E, I, H as consumed by the loop filter macro
// Out:    x0 holds its entry value again on return
// Clobb:  x9, x10, v0, v1, v16-v31, and the registers loop_filter_16
//         produces its output in (v2-v6, v8-v15, v17 are stored below)
//
// The return address is kept in x10 and every exit is "ret x10".
function vp9_loop_filter_h_16_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #16
        ld1             {v16.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v18.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v19.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1
        ld1             {v20.8h}, [x9], x1
        ld1             {v28.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v29.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v30.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v31.8h}, [x0], x1
        // Rewind both pointers to row 0.
        sub             x0,  x0,  x1, lsl #3
        sub             x9,  x9,  x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1

        loop_filter_16

        // Full-width path: transpose the left columns (v16 followed by
        // v2-v6, v8, v9) and the right columns (v10-v15, v17, v31) back
        // into rows. v16 and v31 are the p7/q7 registers loaded above.
        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0,  v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0,  v1

        // Left half of each row goes through x9, right half through x0.
        st1             {v16.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v2.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3

        ret             x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8
        ret             x10
7:
        // The same writeback as in loop_filter_h_4_8
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1