/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"


// The main loop filter macro is templated and can produce filters for
// vectors of 8 or 16 bytes. The register mapping throughout the filter
// is close to identical to the arm version (please try to maintain this,
// if either is changed!). When the arm version uses e.g. d20 for the
// input variable p3, the aarch64 version uses v20.8b or v20.16b, depending
// on vector length.
//
// The number of elements in the vector is passed in via the macro parameter
// \sz, which is either .8b or .16b. For simple instructions that don't
// lengthen or narrow things, this can easily be templated like this:
//      uabd            v4\sz,  v20\sz, v21\sz
//
// For instructions that lengthen or narrow content, the arm version would
// have used q registers. For these instructions, we have macros that expand
// into either a single e.g. uaddl instruction, or into a uaddl + uaddl2
// pair, depending on the \sz parameter. Wherever the arm version would have
// used a q register, these macros instead take two v registers, i.e. q3
// is mapped to v6+v7. For the case with 8 byte input vectors, such a
// lengthening operation is only stored in v6.8h (what was in q3 in the arm
// case), while the 16 byte input vectors will use v6.8h + v7.8h.
// Such a macro invocation would look like this:
//      uaddl_sz        v8.8h,  v9.8h,  v17,  v18,  \sz
//
// That is, in the 8 byte input vector case, the second register in these
// register pairs will be unused.
// Unfortunately, this makes the code quite hard to read. For readability,
// see the arm version instead.

// Size-templated helper macros.
//
// Each macro below always emits the instruction operating on the low half
// (first destination / .8b source), and additionally emits the second
// instruction (second destination, or the "2" high-half form of the
// instruction) only when \sz is .16b.
//
// Operands named dst1/dst2/in1..in4 are passed with their arrangement
// suffix already attached (e.g. v0.8h); operands that get \().8b / \().16b
// appended inside the macro are passed as bare register names (e.g. v23).

// dst1 = in1 + in3; for .16b also dst2 = in2 + in4
.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
        add             \dst1,  \in1,  \in3
.ifc \sz, .16b
        add             \dst2,  \in2,  \in4
.endif
.endm

// dst1 = in1 - in3; for .16b also dst2 = in2 - in4
.macro sub_sz dst1, dst2, in1, in2, in3, in4, sz
        sub             \dst1,  \in1,  \in3
.ifc \sz, .16b
        sub             \dst2,  \in2,  \in4
.endif
.endm

// Unsigned widening add of the bytes in in3 onto in1 (and in2 for .16b)
.macro uaddw_sz dst1, dst2, in1, in2, in3, sz
        uaddw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        uaddw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

// Unsigned widening subtract of the bytes in in3 from in1 (and in2 for .16b)
.macro usubw_sz dst1, dst2, in1, in2, in3, sz
        usubw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        usubw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

// Unsigned long subtract: dst1 (and dst2 for .16b) = in1 - in2, widened
.macro usubl_sz dst1, dst2, in1, in2, sz
        usubl           \dst1,  \in1\().8b,  \in2\().8b
.ifc \sz, .16b
        usubl2          \dst2,  \in1\().16b, \in2\().16b
.endif
.endm

// Signed saturating narrow of in1 (and in2 for .16b) into dst
.macro sqxtn_sz dst, in1, in2, sz
        sqxtn           \dst\().8b,  \in1
.ifc \sz, .16b
        sqxtn2          \dst\().16b, \in2
.endif
.endm

// Signed-to-unsigned saturating narrow of in1 (and in2 for .16b) into dst
.macro sqxtun_sz dst, in1, in2, sz
        sqxtun          \dst\().8b,  \in1
.ifc \sz, .16b
        sqxtun2         \dst\().16b, \in2
.endif
.endm

// dst1 = in1 * in3; for .16b also dst2 = in2 * in4
.macro mul_sz dst1, dst2, in1, in2, in3, in4, sz
        mul             \dst1,  \in1,  \in3
.ifc \sz, .16b
        mul             \dst2,  \in2,  \in4
.endif
.endm

// Signed widening add of the bytes in in3 onto in1 (and in2 for .16b)
.macro saddw_sz dst1, dst2, in1, in2, in3, sz
        saddw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        saddw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

// Signed widening subtract of the bytes in in3 from in1 (and in2 for .16b)
.macro ssubw_sz dst1, dst2, in1, in2, in3, sz
        ssubw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        ssubw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

// Zero-extend the bytes of in into dst1 (and dst2 for .16b)
.macro uxtl_sz dst1, dst2, in, sz
        uxtl            \dst1,  \in\().8b
.ifc \sz, .16b
        uxtl2           \dst2,  \in\().16b
.endif
.endm

// Unsigned long add: dst1 (and dst2 for .16b) = in1 + in2, widened
.macro uaddl_sz dst1, dst2, in1, in2, sz
        uaddl           \dst1,  \in1\().8b,  \in2\().8b
.ifc \sz, .16b
        uaddl2          \dst2,  \in1\().16b, \in2\().16b
.endif
.endm

// Rounding shift right by \shift and narrow in1 (and in2 for .16b) into dst
.macro rshrn_sz dst, in1, in2, shift, sz
        rshrn           \dst\().8b,  \in1,  \shift
.ifc \sz, .16b
        rshrn2          \dst\().16b, \in2,  \shift
.endif
.endm

// Unsigned widening shift left of the bytes in in by \shift
.macro ushll_sz dst1, dst2, in, shift, sz
        ushll           \dst1,  \in\().8b,   \shift
.ifc \sz, .16b
        ushll2          \dst2,  \in\().16b,  \shift
.endif
.endm

// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
// When comparing to the arm version, tmpq1 == tmp1 + tmp2,
// tmpq2 == tmp3 + tmp4, etc.
//
// Macro parameters:
//   wd        - filter width, 4, 8 or 16; selects whether the flat8in
//               (wd >= 8) and flat8out (wd == 16) stages are assembled.
//   sz        - vector arrangement suffix, .8b or .16b.
//   mix       - 0: E/I/H are broadcast from w2/w3/w4 to every lane.
//               non-zero: the low 8 lanes get byte 0 and the high 8 lanes
//               get byte 1 of w2/w3/w4 (dup .8h + rev16 + uzp1), and x11 is
//               read as a byte mask for flat8in (only applied for wd == 8).
//   tmp1-tmp8 - bare names of the vector registers to use as temporaries.
//
// Registers and labels used by the macro body:
//   w2, w3, w4 - E, I, H thresholds (read).
//   x5, x6     - clobbered for the "any lane set?" mask tests.
//   9f         - branched to when no pixel needs filtering; the label must
//                be provided after the macro invocation.
//   6f         - branched to when no pixel needs flat8in (wd >= 8). For
//                wd == 16 the label is defined inside the macro, for
//                wd == 8 it must be provided after the macro invocation.
//   x14, x15   - (wd == 16 only) alternative return targets, used via
//                "ret x14" / "ret x15" when flat8out can be skipped.
// The macro falls through at its end; the instantiating function supplies
// the final return.
.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
.if \mix == 0
        dup             v0\sz,  w2        // E
        dup             v2\sz,  w3        // I
        dup             v3\sz,  w4        // H
.else
        dup             v0.8h,  w2        // E
        dup             v2.8h,  w3        // I
        dup             v3.8h,  w4        // H
        rev16           v1.16b, v0.16b    // E
        rev16           v4.16b, v2.16b    // I
        rev16           v5.16b, v3.16b    // H
        uzp1            v0.16b, v0.16b, v1.16b
        uzp1            v2.16b, v2.16b, v4.16b
        uzp1            v3.16b, v3.16b, v5.16b
.endif

        uabd            v4\sz,  v20\sz, v21\sz        // abs(p3 - p2)
        uabd            v5\sz,  v21\sz, v22\sz        // abs(p2 - p1)
        uabd            v6\sz,  v22\sz, v23\sz        // abs(p1 - p0)
        uabd            v7\sz,  v24\sz, v25\sz        // abs(q0 - q1)
        uabd            \tmp1\sz,  v25\sz, v26\sz     // abs(q1 - q2)
        uabd            \tmp2\sz,  v26\sz, v27\sz     // abs(q2 - q3)
        umax            v4\sz,  v4\sz,  v5\sz
        umax            v5\sz,  v6\sz,  v7\sz
        umax            \tmp1\sz,  \tmp1\sz, \tmp2\sz
        uabd            v6\sz,  v23\sz, v24\sz        // abs(p0 - q0)
        umax            v4\sz,  v4\sz,  v5\sz
        uqadd           v6\sz,  v6\sz,  v6\sz         // abs(p0 - q0) * 2
        uabd            v5\sz,  v22\sz, v25\sz        // abs(p1 - q1)
        umax            v4\sz,  v4\sz,  \tmp1\sz      // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5\sz,  v5\sz,  #1
        cmhs            v4\sz,  v2\sz,  v4\sz         // max(abs()) <= I
        uqadd           v6\sz,  v6\sz,  v5\sz         // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        cmhs            v5\sz,  v0\sz,  v6\sz
        and             v4\sz,  v4\sz,  v5\sz         // fm

        // If no pixels need filtering, just exit as soon as possible
        mov             x5,  v4.d[0]
.ifc \sz, .16b
        mov             x6,  v4.d[1]
        adds            x5,  x5,  x6
        b.eq            9f
.else
        cbz             x5,  9f
.endif

.if \wd >= 8
        movi            v0\sz,  #1

        uabd            v6\sz,  v20\sz, v23\sz        // abs(p3 - p0)
        uabd            v2\sz,  v21\sz, v23\sz        // abs(p2 - p0)
        uabd            v1\sz,  v22\sz, v23\sz        // abs(p1 - p0)
        uabd            \tmp1\sz,  v25\sz, v24\sz     // abs(q1 - q0)
        uabd            \tmp2\sz,  v26\sz, v24\sz     // abs(q2 - q0)
        uabd            \tmp3\sz,  v27\sz, v24\sz     // abs(q3 - q0)
        umax            v6\sz,  v6\sz,  v2\sz
        umax            v1\sz,  v1\sz,  \tmp1\sz
        umax            \tmp2\sz,  \tmp2\sz,  \tmp3\sz
.if \wd == 16
        uabd            v7\sz,  v16\sz, v23\sz        // abs(p7 - p0)
        umax            v6\sz,  v6\sz,  v1\sz
        uabd            v2\sz,  v17\sz, v23\sz        // abs(p6 - p0)
        umax            v6\sz,  v6\sz,  \tmp2\sz
        uabd            v1\sz,  v18\sz, v23\sz        // abs(p5 - p0)
        cmhs            v6\sz,  v0\sz,  v6\sz         // flat8in
        uabd            v8\sz,  v19\sz, v23\sz        // abs(p4 - p0)
        and             v6\sz,  v6\sz,  v4\sz         // flat8in && fm
        uabd            v9\sz,  v28\sz, v24\sz        // abs(q4 - q0)
        bic             v4\sz,  v4\sz,  v6\sz         // fm && !flat8in
        uabd            v10\sz, v29\sz, v24\sz        // abs(q5 - q0)
        uabd            v11\sz, v30\sz, v24\sz        // abs(q6 - q0)
        uabd            v12\sz, v31\sz, v24\sz        // abs(q7 - q0)

        umax            v7\sz,  v7\sz,  v2\sz
        umax            v1\sz,  v1\sz,  v8\sz
        umax            v9\sz,  v9\sz,  v10\sz
        umax            v11\sz, v11\sz, v12\sz
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5\sz,  v22\sz, v23\sz        // abs(p1 - p0)
.if \wd == 16
        umax            v7\sz,  v7\sz,  v1\sz
        umax            v9\sz,  v9\sz,  v11\sz
.elseif \wd == 8
        umax            v6\sz,  v6\sz,  v1\sz
.endif
        uabd            v1\sz,  v25\sz, v24\sz        // abs(q1 - q0)
.if \wd == 16
        umax            v7\sz,  v7\sz,  v9\sz
.elseif \wd == 8
        umax            v6\sz,  v6\sz,  \tmp2\sz
.endif
        usubl_sz        \tmp1\().8h,  \tmp2\().8h,  v22,  v25, \sz // p1 - q1
        umax            v5\sz,  v5\sz,  v1\sz         // max(abs(p1 - p0), abs(q1 - q0))
.if \mix != 0
        mov             v1.d[0],  x11
.endif
        usubl_sz        \tmp3\().8h,  \tmp4\().8h,  v24,  v23, \sz // q0 - p0
        movi            \tmp5\().8h,  #3
.if \wd == 8
        cmhs            v6\sz,  v0\sz,  v6\sz         // flat8in
.endif
.if \mix != 0
        sxtl            v1.8h,  v1.8b
.endif
        cmhs            v5\sz,  v3\sz,  v5\sz         // !hev
.if \wd == 8
        // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
.if \mix != 0
        and             v6\sz,  v6\sz,  v1.16b
.endif
        and             v6\sz,  v6\sz,  v4\sz         // flat8in && fm
.endif
        sqxtn_sz        \tmp1,  \tmp1\().8h,  \tmp2\().8h, \sz // av_clip_int8(p1 - q1)
.if \wd == 16
        cmhs            v7\sz,  v0\sz,  v7\sz         // flat8out
.elseif \wd == 8
        bic             v4\sz,  v4\sz,  v6\sz         // fm && !flat8in
.endif
        and             v5\sz,  v5\sz,  v4\sz         // !hev && fm && !flat8in
.if \wd == 16
        and             v7\sz,  v7\sz,  v6\sz         // flat8out && flat8in && fm
.endif

        mul_sz          \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h,  \tmp4\().8h,  \tmp5\().8h,  \tmp5\().8h, \sz // 3 * (q0 - p0)
        bic             \tmp1\sz,  \tmp1\sz,  v5\sz   // if (!hev) av_clip_int8 = 0
        movi            v2\sz,  #4
        saddw_sz        \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h,  \tmp4\().8h,  \tmp1, \sz // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        movi            v3\sz,  #3
        sqxtn_sz        \tmp1,  \tmp3\().8h,  \tmp4\().8h, \sz // f
.if \wd == 16
        bic             v6\sz,  v6\sz,  v7\sz         // fm && flat8in && !flat8out
.endif

        sqadd           \tmp3\sz,  \tmp1\sz,  v2\sz   // FFMIN(f + 4, 127)
        sqadd           \tmp4\sz,  \tmp1\sz,  v3\sz   // FFMIN(f + 3, 127)
        uxtl_sz         v0.8h,  v1.8h,  v23, \sz      // p0
        sshr            \tmp3\sz,  \tmp3\sz,  #3      // f1
        sshr            \tmp4\sz,  \tmp4\sz,  #3      // f2

        uxtl_sz         v2.8h,  v3.8h,  v24, \sz      // q0
        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp4, \sz // p0 + f2
        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q0 - f1
        sqxtun_sz       v0,  v0.8h,  v1.8h,  \sz      // out p0
        sqxtun_sz       v1,  v2.8h,  v3.8h,  \sz      // out q0
        srshr           \tmp3\sz, \tmp3\sz, #1        // f = (f1 + 1) >> 1
        bit             v23\sz, v0\sz,  v4\sz         // if (fm && !flat8in)
        bit             v24\sz, v1\sz,  v4\sz

        uxtl_sz         v0.8h,  v1.8h,  v22, \sz      // p1
        uxtl_sz         v2.8h,  v3.8h,  v25, \sz      // q1
.if \wd >= 8
        mov             x5,  v6.d[0]
.ifc \sz, .16b
        mov             x6,  v6.d[1]
.endif
.endif
        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
        sqxtun_sz       v0,  v0.8h,  v1.8h, \sz       // out p1
        sqxtun_sz       v2,  v2.8h,  v3.8h, \sz       // out q1
.if \wd >= 8
.ifc \sz, .16b
        adds            x5,  x5,  x6
.endif
.endif
        bit             v22\sz, v0\sz,  v5\sz         // if (!hev && fm && !flat8in)
        bit             v25\sz, v2\sz,  v5\sz

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.ifc \sz, .16b
        b.eq            6f
.else
        cbz             x5,  6f
.endif

        // flat8in
        uaddl_sz        \tmp1\().8h, \tmp2\().8h,  v20, v21, \sz
        uaddl_sz        \tmp3\().8h, \tmp4\().8h,  v22, v25, \sz
        uaddl_sz        \tmp5\().8h, \tmp6\().8h,  v20, v22, \sz
        uaddl_sz        \tmp7\().8h, \tmp8\().8h,  v23, v26, \sz
        add_sz          v0.8h,  v1.8h,  \tmp1\().8h, \tmp2\().8h, \tmp1\().8h, \tmp2\().8h, \sz
        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v23, \sz
        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v24, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp5\().8h, \tmp6\().8h, \sz
        sub_sz          \tmp3\().8h, \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1\().8h, \tmp2\().8h, \sz
        sub_sz          \tmp7\().8h, \tmp8\().8h,  \tmp7\().8h, \tmp8\().8h,  \tmp5\().8h, \tmp6\().8h, \sz
        rshrn_sz        v2,  v0.8h,  v1.8h,  #3,  \sz // out p2

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
        uaddl_sz        \tmp1\().8h, \tmp2\().8h,  v20, v23, \sz
        uaddl_sz        \tmp3\().8h, \tmp4\().8h,  v24, v27, \sz
        rshrn_sz        v3,  v0.8h,  v1.8h,  #3,  \sz // out p1

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
        sub_sz          \tmp3\().8h, \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1\().8h, \tmp2\().8h, \sz
        uaddl_sz        \tmp5\().8h, \tmp6\().8h,  v21, v24, \sz
        uaddl_sz        \tmp7\().8h, \tmp8\().8h,  v25, v27, \sz
        rshrn_sz        v4,  v0.8h,  v1.8h,  #3,  \sz // out p0

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
        sub_sz          \tmp7\().8h, \tmp8\().8h,  \tmp7\().8h, \tmp8\().8h,  \tmp5\().8h, \tmp6\().8h, \sz
        uaddl_sz        \tmp1\().8h, \tmp2\().8h,  v22, v25, \sz
        uaddl_sz        \tmp3\().8h, \tmp4\().8h,  v26, v27, \sz
        rshrn_sz        v5,  v0.8h,  v1.8h,  #3,  \sz // out q0

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
        sub_sz          \tmp3\().8h, \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1\().8h, \tmp2\().8h, \sz
        rshrn_sz        \tmp5,  v0.8h,  v1.8h,  #3,  \sz // out q1

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8part below, since we only update those pixels
        // which won't be touched below.
        bit             v21\sz, v2\sz,  v6\sz
        bit             v22\sz, v3\sz,  v6\sz
        bit             v23\sz, v4\sz,  v6\sz
        rshrn_sz        \tmp6,  v0.8h,  v1.8h,  #3,  \sz // out q2
        bit             v24\sz, v5\sz,  v6\sz
        bit             v25\sz, \tmp5\sz,  v6\sz
        bit             v26\sz, \tmp6\sz,  v6\sz
.endif
.if \wd == 16
6:
        orr             v2\sz,  v6\sz,  v7\sz
        mov             x5,  v2.d[0]
.ifc \sz, .16b
        mov             x6,  v2.d[1]
        adds            x5,  x5,  x6
        b.ne            1f
.else
        cbnz            x5,  1f
.endif
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        ret             x14
1:

        mov             x5,  v7.d[0]
.ifc \sz, .16b
        mov             x6,  v7.d[1]
        adds            x5,  x5,  x6
        b.ne            1f
.else
        cbnz            x5,  1f
.endif
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        ret             x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16); v7 holds
        // the flat8out mask that every bif below selects on.
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
        // Note that v18-v23 are overwritten as temporaries towards the end of
        // this section, after their last use as inputs.
        ushll_sz        v0.8h,  v1.8h,  v16,  #3,  \sz           // 8 * v16
        usubw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v16, \sz // 7 * v16
        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v17, \sz
        uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
        uaddl_sz        v10.8h, v11.8h, v19, v20, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v8.8h,  v9.8h,  \sz
        uaddl_sz        v8.8h,  v9.8h,  v16, v17, \sz
        uaddl_sz        v12.8h, v13.8h, v21, v22, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v10.8h, v11.8h, v18, v25, \sz
        uaddl_sz        v14.8h, v15.8h, v23, v24, \sz
        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v16, v18, \sz
        uaddl_sz        v14.8h, v15.8h, v19, v26, \sz
        rshrn_sz        v2,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v8.8h,  v9.8h,  v16, v19, \sz
        uaddl_sz        v10.8h, v11.8h, v20, v27, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        bif             v2\sz,  v17\sz, v7\sz
        rshrn_sz        v3,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v16, v20, \sz
        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
        bif             v3\sz,  v18\sz, v7\sz
        rshrn_sz        v4,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v8.8h,  v9.8h,  v16, v21, \sz
        uaddl_sz        v10.8h, v11.8h, v22, v29, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        bif             v4\sz,  v19\sz, v7\sz
        rshrn_sz        v5,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v16, v22, \sz
        uaddl_sz        v14.8h, v15.8h, v23, v30, \sz
        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
        bif             v5\sz,  v20\sz, v7\sz
        rshrn_sz        v6,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v10.8h, v11.8h, v16, v23, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v24, v31, \sz
        bif             v6\sz,  v21\sz, v7\sz
        rshrn_sz        v8,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        sub_sz          v10.8h, v11.8h, v12.8h, v13.8h, v10.8h, v11.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v17, v24, \sz
        uaddl_sz        v14.8h, v15.8h, v25, v31, \sz
        bif             v8\sz,  v22\sz, v7\sz
        rshrn_sz        v9,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v26, v31, \sz
        bif             v9\sz,  v23\sz, v7\sz
        rshrn_sz        v10, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v14.8h, v15.8h, v18, v25, \sz
        uaddl_sz        v18.8h, v19.8h, v19, v26, \sz
        sub_sz          v12.8h, v13.8h, v12.8h, v13.8h, v14.8h, v15.8h, \sz
        uaddl_sz        v14.8h, v15.8h, v27, v31, \sz
        bif             v10\sz, v24\sz, v7\sz
        rshrn_sz        v11, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v20, v27, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v18.8h, v19.8h, \sz
        uaddl_sz        v18.8h, v19.8h, v28, v31, \sz
        bif             v11\sz, v25\sz, v7\sz
        sub_sz          v18.8h, v19.8h, v18.8h, v19.8h, v12.8h, v13.8h, \sz
        rshrn_sz        v12, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
        uaddl_sz        v20.8h, v21.8h, v29, v31, \sz
        bif             v12\sz, v26\sz, v7\sz
        rshrn_sz        v13, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v18.8h, v19.8h, \sz
        sub_sz          v20.8h, v21.8h, v20.8h, v21.8h, v14.8h, v15.8h, \sz
        uaddl_sz        v18.8h, v19.8h, v22, v29, \sz
        uaddl_sz        v22.8h, v23.8h, v30, v31, \sz
        bif             v13\sz, v27\sz, v7\sz
        rshrn_sz        v14, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v20.8h, v21.8h, \sz
        sub_sz          v22.8h, v23.8h, v22.8h, v23.8h, v18.8h, v19.8h, \sz
        bif             v14\sz, v28\sz, v7\sz
        rshrn_sz        v15, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v22.8h, v23.8h, \sz
        bif             v15\sz, v29\sz, v7\sz
        rshrn_sz        v17, v0.8h,  v1.8h,  #4,  \sz
        bif             v17\sz, v30\sz, v7\sz
.endif
.endm

// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
// while we need those for inputs/outputs in wd=16 and use v8-v15
// for temp registers there instead.
// Internal filter cores. Each one instantiates the loop_filter macro for one
// (wd, vector size, mix) combination, using v16-v19/v28-v31 as temporaries
// for wd <= 8 and v8-v15 for wd == 16. They are entered with `bl` from the
// loop_filter_* wrapper macros below, so the plain `ret` returns into the
// public function, which then stores the filtered registers.
//
// The numeric labels are alternative exits. Nothing visible here jumps to
// them, so they are reached only from inside the loop_filter expansion:
//   9: return through x10. Every public function copies its own x30 into
//      x10 on entry, so this goes straight back to the public function's
//      caller and skips the writeback. The wd == 16 variants first pop
//      d8-d15, which the wd == 16 public functions push on entry.
//   6: return through x13, which the wd == 8 wrappers point at the public
//      function's own "6:" label.
function vp9_loop_filter_4
        loop_filter     4,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
        ret
9:
        ret             x10
endfunc

function vp9_loop_filter_4_16b_mix_44
        loop_filter     4,  .16b, 44,   v16, v17, v18, v19, v28, v29, v30, v31
        ret
9:
        ret             x10
endfunc

function vp9_loop_filter_8
        loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
        ret
6:
        ret             x13
9:
        ret             x10
endfunc

function vp9_loop_filter_8_16b_mix
        loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
        ret
6:
        ret             x13
9:
        ret             x10
endfunc

function vp9_loop_filter_16
        loop_filter     16, .8b,  0,    v8, v9, v10, v11, v12, v13, v14, v15
        ret
9:
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        ret             x10
endfunc

function vp9_loop_filter_16_16b
        loop_filter     16, .16b, 0,    v8, v9, v10, v11, v12, v13, v14, v15
        ret
9:
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        ret             x10
endfunc

// Wrapper macros used by the public functions to enter the cores above.
// The wd == 8 and wd == 16 variants first load the addresses of forward
// local labels (6f, or 7f and 8f) into x13, or x14 and x15, so every
// function that invokes them must define those labels after the invocation.
// NOTE(review): x14/x15 (and x11 below) are consumed inside loop_filter,
// which is outside this part of the file - confirm their use there.
.macro loop_filter_4
        bl              vp9_loop_filter_4
.endm

.macro loop_filter_4_16b_mix mix
        bl              vp9_loop_filter_4_16b_mix_\mix
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
.endm

// x11 is loaded with a mask selected by the mix argument: 48 sets the upper
// 32 bits, 84 sets the lower 32 bits, anything else sets all 64 bits.
.macro loop_filter_8_16b_mix mix
        // calculate alternative 'return' targets
        adr             x13, 6f
.if \mix == 48
        mov             x11, #0xffffffff00000000
.elseif \mix == 84
        mov             x11, #0x00000000ffffffff
.else
        mov             x11, #0xffffffffffffffff
.endif
        bl              vp9_loop_filter_8_16b_mix
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
.endm

.macro loop_filter_16_16b
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16_16b
.endm


// The public functions in this file have got the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
//
// Register use shared by all of them: x0 = dst, x1 = stride, x9 = second
// row pointer, x10 = saved return address (x30 is overwritten by the `bl`
// in the wrapper macros, so they return with `ret x10`).

// Vertical filtering, wd == 4, 8 pixels wide. Loads the four rows above dst
// (p3-p0, through x9) and the four rows from dst on (q0-q3, through x0),
// then writes back only p1, p0, q0 and q1.
function ff_vp9_loop_filter_v_4_8_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1 // p3
        ld1             {v24.8b}, [x0], x1 // q0
        ld1             {v21.8b}, [x9], x1 // p2
        ld1             {v25.8b}, [x0], x1 // q1
        ld1             {v22.8b}, [x9], x1 // p1
        ld1             {v26.8b}, [x0], x1 // q2
        ld1             {v23.8b}, [x9], x1 // p0
        ld1             {v27.8b}, [x0], x1 // q3
        // Rewind: x0 = dst (q0 row), x9 = dst - 2 * stride (p1 row).
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4

        st1             {v22.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1

        ret             x10
endfunc

// Same as ff_vp9_loop_filter_v_4_8_neon, but 16 pixels wide, using the
// 16 byte core with mix 44.
function ff_vp9_loop_filter_v_44_16_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.16b}, [x9], x1 // p3
        ld1             {v24.16b}, [x0], x1 // q0
        ld1             {v21.16b}, [x9], x1 // p2
        ld1             {v25.16b}, [x0], x1 // q1
        ld1             {v22.16b}, [x9], x1 // p1
        ld1             {v26.16b}, [x0], x1 // q2
        ld1             {v23.16b}, [x9], x1 // p0
        ld1             {v27.16b}, [x0], x1 // q3
        // Rewind: x0 = dst (q0 row), x9 = dst - 2 * stride (p1 row).
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4_16b_mix 44

        st1             {v22.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1

        ret             x10
endfunc

// Horizontal filtering, wd == 4, 8 rows. Reads 8 bytes per row starting at
// dst - 4 (rows 0-3 through x9, rows 4-7 through x0), transposes so each
// register holds one pixel column, filters, and transposes the middle four
// columns back for the store.
function ff_vp9_loop_filter_h_4_8_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        // Move x0/x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9,  x9,  #2
        add             x0,  x0,  #2

        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // We will only write the mid 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[1], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[1], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[1], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[1], [x0], x1

        ret             x10
endfunc

// Horizontal filtering, wd == 4, 16 rows. Rows 0-7 (through x9) fill the low
// halves of v20-v27 and rows 8-15 (through x0) fill the high halves.
function ff_vp9_loop_filter_h_44_16_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #3
        ld1             {v20.8b},   [x9], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v21.8b},   [x9], x1
        ld1             {v21.d}[1], [x0], x1
        ld1             {v22.8b},   [x9], x1
        ld1             {v22.d}[1], [x0], x1
        ld1             {v23.8b},   [x9], x1
        ld1             {v23.d}[1], [x0], x1
        ld1             {v24.8b},   [x9], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v25.8b},   [x9], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v26.8b},   [x9], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v27.8b},   [x9], x1
        ld1             {v27.d}[1], [x0], x1

        // Rewind both pointers and skip the 2 leftmost, unchanged pixels.
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  #2
        add             x0,  x0,  #2

        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4_16b_mix 44

        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29

        // Lanes 0/1 go to rows 0-7 through x9, lanes 2/3 to rows 8-15
        // through x0; 4 bytes per row.
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x9], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x9], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x9], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x9], x1
        st1             {v25.s}[3], [x0], x1

        ret             x10
endfunc

// Vertical filtering, wd == 8, 8 pixels wide. The normal return writes back
// p2-p0 and q0-q2; the alternative exit at 6 writes back only p1, p0, q0
// and q1.
function ff_vp9_loop_filter_v_8_8_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1 // p3
        ld1             {v24.8b}, [x0], x1 // q0
        ld1             {v21.8b}, [x9], x1 // p2
        ld1             {v25.8b}, [x0], x1 // q1
        ld1             {v22.8b}, [x9], x1 // p1
        ld1             {v26.8b}, [x0], x1 // q2
        ld1             {v23.8b}, [x9], x1 // p0
        ld1             {v27.8b}, [x0], x1 // q3
        // Rewind: x0 = dst (q0 row), x9 = dst - 3 * stride (p2 row).
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        add             x9,  x9,  x1

        loop_filter_8

        st1             {v21.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1

        ret             x10
6:
        // Alternative exit (entered through x13): same writeback as the
        // wd == 4 case, starting at the p1 row.
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        ret             x10
endfunc

// Emits the 16 pixel wide vertical wd == 8 filter for one mix value. The
// structure matches ff_vp9_loop_filter_v_8_8_neon with 16 byte vectors.
.macro mix_v_16 mix
function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.16b}, [x9], x1 // p3
        ld1             {v24.16b}, [x0], x1 // q0
        ld1             {v21.16b}, [x9], x1 // p2
        ld1             {v25.16b}, [x0], x1 // q1
        ld1             {v22.16b}, [x9], x1 // p1
        ld1             {v26.16b}, [x0], x1 // q2
        ld1             {v23.16b}, [x9], x1 // p0
        ld1             {v27.16b}, [x0], x1 // q3
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        add             x9,  x9,  x1

        loop_filter_8_16b_mix \mix

        st1             {v21.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v22.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v26.16b}, [x0], x1

        ret             x10
6:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        ret             x10
endfunc
.endm

mix_v_16 48
mix_v_16 84
mix_v_16 88

// Horizontal filtering, wd == 8, 8 rows. Reads 8 bytes per row starting at
// dst - 4 (rows 0-3 through x9, rows 4-7 through x0) and transposes them
// into one pixel column per register before filtering.
function ff_vp9_loop_filter_h_8_8_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2

        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v21.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v27.8b}, [x0], x1

        ret             x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add             x9,  x9,  #2
        add             x0,  x0,  #2
        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[1], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[1], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[1], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[1], [x0], x1
        ret             x10
endfunc

// Emits the 16 row horizontal wd == 8 filter for one mix value. Rows 0-7
// (through x9) use the low register halves, rows 8-15 (through x0) the high
// halves. The alternative exit at 6 stores only the middle 4 pixels per row.
.macro mix_h_16 mix
function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #3
        ld1             {v20.8b},   [x9], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v21.8b},   [x9], x1
        ld1             {v21.d}[1], [x0], x1
        ld1             {v22.8b},   [x9], x1
        ld1             {v22.d}[1], [x0], x1
        ld1             {v23.8b},   [x9], x1
        ld1             {v23.d}[1], [x0], x1
        ld1             {v24.8b},   [x9], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v25.8b},   [x9], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v26.8b},   [x9], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v27.8b},   [x9], x1
        ld1             {v27.d}[1], [x0], x1

        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3

        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8_16b_mix \mix

        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b},   [x9], x1
        st1             {v20.d}[1], [x0], x1
        st1             {v21.8b},   [x9], x1
        st1             {v21.d}[1], [x0], x1
        st1             {v22.8b},   [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.8b},   [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.8b},   [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.8b},   [x9], x1
        st1             {v25.d}[1], [x0], x1
        st1             {v26.8b},   [x9], x1
        st1             {v26.d}[1], [x0], x1
        st1             {v27.8b},   [x9], x1
        st1             {v27.d}[1], [x0], x1

        ret             x10
6:
        add             x9,  x9,  #2
        add             x0,  x0,  #2
        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x9], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x9], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x9], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x9], x1
        st1             {v25.s}[3], [x0], x1
        ret             x10
endfunc
.endm

mix_h_16 48
mix_h_16 84
mix_h_16 88

// Vertical filtering, wd == 16, 8 pixels wide. d8-d15 are pushed (64 bytes
// of stack) because the wd == 16 core uses v8-v15; every exit pops them
// again at label 9. Three writebacks exist: the fall-through one stores 7
// rows on each side of the edge, 8 stores 3 rows on each side, and 7 stores
// 2 rows on each side.
function ff_vp9_loop_filter_v_16_8_neon, export=1
        mov             x10, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.8b}, [x9], x1 // p7
        ld1             {v24.8b}, [x0], x1 // q0
        ld1             {v17.8b}, [x9], x1 // p6
        ld1             {v25.8b}, [x0], x1 // q1
        ld1             {v18.8b}, [x9], x1 // p5
        ld1             {v26.8b}, [x0], x1 // q2
        ld1             {v19.8b}, [x9], x1 // p4
        ld1             {v27.8b}, [x0], x1 // q3
        ld1             {v20.8b}, [x9], x1 // p3
        ld1             {v28.8b}, [x0], x1 // q4
        ld1             {v21.8b}, [x9], x1 // p2
        ld1             {v29.8b}, [x0], x1 // q5
        ld1             {v22.8b}, [x9], x1 // p1
        ld1             {v30.8b}, [x0], x1 // q6
        ld1             {v23.8b}, [x9], x1 // p0
        ld1             {v31.8b}, [x0], x1 // q7
        // Rewind: x0 = dst (q0 row), x9 = dst - 7 * stride (p6 row).
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8b},  [x9], x1
        st1             {v10.8b}, [x0], x1
        st1             {v3.8b},  [x9], x1
        st1             {v11.8b}, [x0], x1
        st1             {v4.8b},  [x9], x1
        st1             {v12.8b}, [x0], x1
        st1             {v5.8b},  [x9], x1
        st1             {v13.8b}, [x0], x1
        st1             {v6.8b},  [x9], x1
        st1             {v14.8b}, [x0], x1
        st1             {v8.8b},  [x9], x1
        st1             {v15.8b}, [x0], x1
        st1             {v9.8b},  [x9], x1
        st1             {v17.8b}, [x0], x1
9:
        // Common epilogue: restore d8-d15 and return to our caller.
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        ret             x10
8:
        add             x9,  x9,  x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1
        b               9b
7:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        b               9b
endfunc

// Same as ff_vp9_loop_filter_v_16_8_neon, but 16 pixels wide.
function ff_vp9_loop_filter_v_16_16_neon, export=1
        mov             x10, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.16b}, [x9], x1 // p7
        ld1             {v24.16b}, [x0], x1 // q0
        ld1             {v17.16b}, [x9], x1 // p6
        ld1             {v25.16b}, [x0], x1 // q1
        ld1             {v18.16b}, [x9], x1 // p5
        ld1             {v26.16b}, [x0], x1 // q2
        ld1             {v19.16b}, [x9], x1 // p4
        ld1             {v27.16b}, [x0], x1 // q3
        ld1             {v20.16b}, [x9], x1 // p3
        ld1             {v28.16b}, [x0], x1 // q4
        ld1             {v21.16b}, [x9], x1 // p2
        ld1             {v29.16b}, [x0], x1 // q5
        ld1             {v22.16b}, [x9], x1 // p1
        ld1             {v30.16b}, [x0], x1 // q6
        ld1             {v23.16b}, [x9], x1 // p0
        ld1             {v31.16b}, [x0], x1 // q7
        // Rewind: x0 = dst (q0 row), x9 = dst - 7 * stride (p6 row).
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  x1

        loop_filter_16_16b

        st1             {v2.16b},  [x9], x1
        st1             {v10.16b}, [x0], x1
        st1             {v3.16b},  [x9], x1
        st1             {v11.16b}, [x0], x1
        st1             {v4.16b},  [x9], x1
        st1             {v12.16b}, [x0], x1
        st1             {v5.16b},  [x9], x1
        st1             {v13.16b}, [x0], x1
        st1             {v6.16b},  [x9], x1
        st1             {v14.16b}, [x0], x1
        st1             {v8.16b},  [x9], x1
        st1             {v15.16b}, [x0], x1
        st1             {v9.16b},  [x9], x1
        st1             {v17.16b}, [x0], x1
9:
        // Common epilogue: restore d8-d15 and return to our caller.
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        ret             x10
8:
        add             x9,  x9,  x1, lsl #2
        st1             {v21.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v22.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v26.16b}, [x0], x1
        b               9b
7:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        b               9b
endfunc

// Horizontal filtering, wd == 16, 8 rows. Reads 16 bytes per row as two
// 8 byte halves (left half from dst - 8 through x9, right half from dst
// through x0). d8-d15 are pushed on entry and popped at label 9.
function ff_vp9_loop_filter_h_16_8_neon, export=1
        mov             x10, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        sub             x9,  x0,  #8
        ld1             {v16.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v17.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v18.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v19.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1
        ld1             {v20.8b}, [x9], x1
        ld1             {v28.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v29.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v30.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v31.8b}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        sub             x9,  x9,  x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
        transpose_8x8B  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1

        loop_filter_16

        // Transpose the output registers back to rows; v16 and v31 are
        // included so that full 8 byte rows can be stored on both sides.
        transpose_8x8B  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0,  v1
        transpose_8x8B  v10, v11, v12, v13, v14, v15, v17, v31, v0,  v1

        st1             {v16.8b}, [x9], x1
        st1             {v10.8b}, [x0], x1
        st1             {v2.8b},  [x9], x1
        st1             {v11.8b}, [x0], x1
        st1             {v3.8b},  [x9], x1
        st1             {v12.8b}, [x0], x1
        st1             {v4.8b},  [x9], x1
        st1             {v13.8b}, [x0], x1
        st1             {v5.8b},  [x9], x1
        st1             {v14.8b}, [x0], x1
        st1             {v6.8b},  [x9], x1
        st1             {v15.8b}, [x0], x1
        st1             {v8.8b},  [x9], x1
        st1             {v17.8b}, [x0], x1
        st1             {v9.8b},  [x9], x1
        st1             {v31.8b}, [x0], x1
9:
        // Common epilogue: restore d8-d15 and return to our caller.
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        ret             x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v21.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v27.8b}, [x0], x1
        b               9b
7:
        // The same writeback as in loop_filter_h_4_8
        sub             x9,  x0,  #2
        add             x0,  x9,  x1, lsl #2
        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[1], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[1], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[1], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[1], [x0], x1
        b               9b
endfunc

function ff_vp9_loop_filter_h_16_16_neon, export=1
        mov             x10, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
1209cabdff1aSopenharmony_ci sub x9, x0, #8 1210cabdff1aSopenharmony_ci ld1 {v16.8b}, [x9], x1 1211cabdff1aSopenharmony_ci ld1 {v24.8b}, [x0], x1 1212cabdff1aSopenharmony_ci ld1 {v17.8b}, [x9], x1 1213cabdff1aSopenharmony_ci ld1 {v25.8b}, [x0], x1 1214cabdff1aSopenharmony_ci ld1 {v18.8b}, [x9], x1 1215cabdff1aSopenharmony_ci ld1 {v26.8b}, [x0], x1 1216cabdff1aSopenharmony_ci ld1 {v19.8b}, [x9], x1 1217cabdff1aSopenharmony_ci ld1 {v27.8b}, [x0], x1 1218cabdff1aSopenharmony_ci ld1 {v20.8b}, [x9], x1 1219cabdff1aSopenharmony_ci ld1 {v28.8b}, [x0], x1 1220cabdff1aSopenharmony_ci ld1 {v21.8b}, [x9], x1 1221cabdff1aSopenharmony_ci ld1 {v29.8b}, [x0], x1 1222cabdff1aSopenharmony_ci ld1 {v22.8b}, [x9], x1 1223cabdff1aSopenharmony_ci ld1 {v30.8b}, [x0], x1 1224cabdff1aSopenharmony_ci ld1 {v23.8b}, [x9], x1 1225cabdff1aSopenharmony_ci ld1 {v31.8b}, [x0], x1 1226cabdff1aSopenharmony_ci ld1 {v16.d}[1], [x9], x1 1227cabdff1aSopenharmony_ci ld1 {v24.d}[1], [x0], x1 1228cabdff1aSopenharmony_ci ld1 {v17.d}[1], [x9], x1 1229cabdff1aSopenharmony_ci ld1 {v25.d}[1], [x0], x1 1230cabdff1aSopenharmony_ci ld1 {v18.d}[1], [x9], x1 1231cabdff1aSopenharmony_ci ld1 {v26.d}[1], [x0], x1 1232cabdff1aSopenharmony_ci ld1 {v19.d}[1], [x9], x1 1233cabdff1aSopenharmony_ci ld1 {v27.d}[1], [x0], x1 1234cabdff1aSopenharmony_ci ld1 {v20.d}[1], [x9], x1 1235cabdff1aSopenharmony_ci ld1 {v28.d}[1], [x0], x1 1236cabdff1aSopenharmony_ci ld1 {v21.d}[1], [x9], x1 1237cabdff1aSopenharmony_ci ld1 {v29.d}[1], [x0], x1 1238cabdff1aSopenharmony_ci ld1 {v22.d}[1], [x9], x1 1239cabdff1aSopenharmony_ci ld1 {v30.d}[1], [x0], x1 1240cabdff1aSopenharmony_ci ld1 {v23.d}[1], [x9], x1 1241cabdff1aSopenharmony_ci ld1 {v31.d}[1], [x0], x1 1242cabdff1aSopenharmony_ci sub x0, x0, x1, lsl #4 1243cabdff1aSopenharmony_ci sub x9, x9, x1, lsl #4 1244cabdff1aSopenharmony_ci 1245cabdff1aSopenharmony_ci transpose_8x16B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 1246cabdff1aSopenharmony_ci transpose_8x16B v24, v25, v26, v27, v28, 
v29, v30, v31, v0, v1 1247cabdff1aSopenharmony_ci 1248cabdff1aSopenharmony_ci loop_filter_16_16b 1249cabdff1aSopenharmony_ci 1250cabdff1aSopenharmony_ci transpose_8x16B v16, v2, v3, v4, v5, v6, v8, v9, v0, v1 1251cabdff1aSopenharmony_ci transpose_8x16B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1 1252cabdff1aSopenharmony_ci 1253cabdff1aSopenharmony_ci st1 {v16.8b}, [x9], x1 1254cabdff1aSopenharmony_ci st1 {v10.8b}, [x0], x1 1255cabdff1aSopenharmony_ci st1 {v2.8b}, [x9], x1 1256cabdff1aSopenharmony_ci st1 {v11.8b}, [x0], x1 1257cabdff1aSopenharmony_ci st1 {v3.8b}, [x9], x1 1258cabdff1aSopenharmony_ci st1 {v12.8b}, [x0], x1 1259cabdff1aSopenharmony_ci st1 {v4.8b}, [x9], x1 1260cabdff1aSopenharmony_ci st1 {v13.8b}, [x0], x1 1261cabdff1aSopenharmony_ci st1 {v5.8b}, [x9], x1 1262cabdff1aSopenharmony_ci st1 {v14.8b}, [x0], x1 1263cabdff1aSopenharmony_ci st1 {v6.8b}, [x9], x1 1264cabdff1aSopenharmony_ci st1 {v15.8b}, [x0], x1 1265cabdff1aSopenharmony_ci st1 {v8.8b}, [x9], x1 1266cabdff1aSopenharmony_ci st1 {v17.8b}, [x0], x1 1267cabdff1aSopenharmony_ci st1 {v9.8b}, [x9], x1 1268cabdff1aSopenharmony_ci st1 {v31.8b}, [x0], x1 1269cabdff1aSopenharmony_ci st1 {v16.d}[1], [x9], x1 1270cabdff1aSopenharmony_ci st1 {v10.d}[1], [x0], x1 1271cabdff1aSopenharmony_ci st1 {v2.d}[1], [x9], x1 1272cabdff1aSopenharmony_ci st1 {v11.d}[1], [x0], x1 1273cabdff1aSopenharmony_ci st1 {v3.d}[1], [x9], x1 1274cabdff1aSopenharmony_ci st1 {v12.d}[1], [x0], x1 1275cabdff1aSopenharmony_ci st1 {v4.d}[1], [x9], x1 1276cabdff1aSopenharmony_ci st1 {v13.d}[1], [x0], x1 1277cabdff1aSopenharmony_ci st1 {v5.d}[1], [x9], x1 1278cabdff1aSopenharmony_ci st1 {v14.d}[1], [x0], x1 1279cabdff1aSopenharmony_ci st1 {v6.d}[1], [x9], x1 1280cabdff1aSopenharmony_ci st1 {v15.d}[1], [x0], x1 1281cabdff1aSopenharmony_ci st1 {v8.d}[1], [x9], x1 1282cabdff1aSopenharmony_ci st1 {v17.d}[1], [x0], x1 1283cabdff1aSopenharmony_ci st1 {v9.d}[1], [x9], x1 1284cabdff1aSopenharmony_ci st1 {v31.d}[1], [x0], x1 
1285cabdff1aSopenharmony_ci9: 1286cabdff1aSopenharmony_ci ldp d8, d9, [sp], 0x10 1287cabdff1aSopenharmony_ci ldp d10, d11, [sp], 0x10 1288cabdff1aSopenharmony_ci ldp d12, d13, [sp], 0x10 1289cabdff1aSopenharmony_ci ldp d14, d15, [sp], 0x10 1290cabdff1aSopenharmony_ci ret x10 1291cabdff1aSopenharmony_ci8: 1292cabdff1aSopenharmony_ci sub x9, x0, #4 1293cabdff1aSopenharmony_ci add x0, x9, x1, lsl #3 1294cabdff1aSopenharmony_ci transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 1295cabdff1aSopenharmony_ci 1296cabdff1aSopenharmony_ci st1 {v20.8b}, [x9], x1 1297cabdff1aSopenharmony_ci st1 {v20.d}[1], [x0], x1 1298cabdff1aSopenharmony_ci st1 {v21.8b}, [x9], x1 1299cabdff1aSopenharmony_ci st1 {v21.d}[1], [x0], x1 1300cabdff1aSopenharmony_ci st1 {v22.8b}, [x9], x1 1301cabdff1aSopenharmony_ci st1 {v22.d}[1], [x0], x1 1302cabdff1aSopenharmony_ci st1 {v23.8b}, [x9], x1 1303cabdff1aSopenharmony_ci st1 {v23.d}[1], [x0], x1 1304cabdff1aSopenharmony_ci st1 {v24.8b}, [x9], x1 1305cabdff1aSopenharmony_ci st1 {v24.d}[1], [x0], x1 1306cabdff1aSopenharmony_ci st1 {v25.8b}, [x9], x1 1307cabdff1aSopenharmony_ci st1 {v25.d}[1], [x0], x1 1308cabdff1aSopenharmony_ci st1 {v26.8b}, [x9], x1 1309cabdff1aSopenharmony_ci st1 {v26.d}[1], [x0], x1 1310cabdff1aSopenharmony_ci st1 {v27.8b}, [x9], x1 1311cabdff1aSopenharmony_ci st1 {v27.d}[1], [x0], x1 1312cabdff1aSopenharmony_ci b 9b 1313cabdff1aSopenharmony_ci7: 1314cabdff1aSopenharmony_ci sub x9, x0, #2 1315cabdff1aSopenharmony_ci add x0, x9, x1, lsl #3 1316cabdff1aSopenharmony_ci transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29 1317cabdff1aSopenharmony_ci st1 {v22.s}[0], [x9], x1 1318cabdff1aSopenharmony_ci st1 {v22.s}[2], [x0], x1 1319cabdff1aSopenharmony_ci st1 {v23.s}[0], [x9], x1 1320cabdff1aSopenharmony_ci st1 {v23.s}[2], [x0], x1 1321cabdff1aSopenharmony_ci st1 {v24.s}[0], [x9], x1 1322cabdff1aSopenharmony_ci st1 {v24.s}[2], [x0], x1 1323cabdff1aSopenharmony_ci st1 {v25.s}[0], [x9], x1 1324cabdff1aSopenharmony_ci st1 
{v25.s}[2], [x0], x1 1325cabdff1aSopenharmony_ci st1 {v22.s}[1], [x9], x1 1326cabdff1aSopenharmony_ci st1 {v22.s}[3], [x0], x1 1327cabdff1aSopenharmony_ci st1 {v23.s}[1], [x9], x1 1328cabdff1aSopenharmony_ci st1 {v23.s}[3], [x0], x1 1329cabdff1aSopenharmony_ci st1 {v24.s}[1], [x9], x1 1330cabdff1aSopenharmony_ci st1 {v24.s}[3], [x0], x1 1331cabdff1aSopenharmony_ci st1 {v25.s}[1], [x9], x1 1332cabdff1aSopenharmony_ci st1 {v25.s}[3], [x0], x1 1333cabdff1aSopenharmony_ci b 9b 1334cabdff1aSopenharmony_ciendfunc 1335