/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
 * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// Target: AArch64 (A64, Advanced SIMD), GNU assembler syntax.
// Every function here is a leaf that only touches v0-v7 and v16-v31 plus
// low-numbered GPRs, so no callee-saved register is spilled.

// ff_vp8_luma_dc_wht_neon
//
// 4x4 inverse Walsh-Hadamard transform of the luma DC coefficients.
//   x0: output base; result k (k = 0..15) is stored as one halfword at
//       x0 + 32*k (presumably the DC slot of 16 consecutive blocks of
//       16 int16 coefficients -- confirm against the C-side prototype)
//   x1: 16 input halfwords; read, then overwritten with zeros
// Clobbers: x0, x1, x3, v0-v7, v16, v30
function ff_vp8_luma_dc_wht_neon, export=1
        ld1             {v0.4h - v3.4h}, [x1]
        movi            v30.8h, #0

        // first pass; the two zeroing stores of the input are interleaved
        add             v4.4h, v0.4h, v3.4h
        add             v6.4h, v1.4h, v2.4h
        st1             {v30.8h}, [x1], #16
        sub             v7.4h, v1.4h, v2.4h
        sub             v5.4h, v0.4h, v3.4h
        st1             {v30.8h}, [x1]
        add             v0.4h, v4.4h, v6.4h
        add             v1.4h, v5.4h, v7.4h
        sub             v2.4h, v4.4h, v6.4h
        sub             v3.4h, v5.4h, v7.4h

        movi            v16.4h, #3

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // adding 3 to the first row propagates into every output of the
        // second pass, so the final >> 3 computes (x + 3) >> 3
        add             v0.4h, v0.4h, v16.4h

        // second pass
        add             v4.4h, v0.4h, v3.4h
        add             v6.4h, v1.4h, v2.4h
        sub             v7.4h, v1.4h, v2.4h
        sub             v5.4h, v0.4h, v3.4h
        add             v0.4h, v4.4h, v6.4h
        add             v1.4h, v5.4h, v7.4h
        sub             v2.4h, v4.4h, v6.4h
        sub             v3.4h, v5.4h, v7.4h

        sshr            v0.4h, v0.4h, #3
        sshr            v1.4h, v1.4h, #3
        sshr            v2.4h, v2.4h, #3
        sshr            v3.4h, v3.4h, #3

        // scatter the 16 results, one halfword every 32 bytes
        mov             x3, #32
        st1             {v0.h}[0], [x0], x3
        st1             {v1.h}[0], [x0], x3
        st1             {v2.h}[0], [x0], x3
        st1             {v3.h}[0], [x0], x3
        st1             {v0.h}[1], [x0], x3
        st1             {v1.h}[1], [x0], x3
        st1             {v2.h}[1], [x0], x3
        st1             {v3.h}[1], [x0], x3
        st1             {v0.h}[2], [x0], x3
        st1             {v1.h}[2], [x0], x3
        st1             {v2.h}[2], [x0], x3
        st1             {v3.h}[2], [x0], x3
        st1             {v0.h}[3], [x0], x3
        st1             {v1.h}[3], [x0], x3
        st1             {v2.h}[3], [x0], x3
        st1             {v3.h}[3], [x0], x3

        ret
endfunc

// ff_vp8_idct_add_neon
//
// 4x4 inverse transform of one coefficient block, added to the destination
// pixels with unsigned saturation.
//   x0: destination pixels (4 rows of 4 bytes)
//   x1: 16 halfword coefficients; read, then overwritten with zeros
//   x2: destination stride in bytes
// The two multiplier constants are kept in v4:
//   v4.h[0] = 20091    used as  x + ((x * 20091) >> 16)   (smull/shrn/add)
//   v4.h[1] = 35468/2  used as  (x * 35468) >> 16         (sqdmulh doubles)
// Clobbers: x0, x1, w4, v0-v7, v16-v27, v29
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b}, [x1]
        mov             w4, #20091
        movk            w4, #35468/2, lsl #16
        dup             v4.2s, w4

        // first pass
        smull           v26.4s, v1.4h, v4.h[0]
        smull           v27.4s, v3.4h, v4.h[0]
        sqdmulh         v20.4h, v1.4h, v4.h[1]          // (in1 * 35468) >> 16
        sqdmulh         v23.4h, v3.4h, v4.h[1]          // (in3 * 35468) >> 16
        shrn            v21.4h, v26.4s, #16
        shrn            v22.4h, v27.4s, #16
        add             v21.4h, v21.4h, v1.4h           // in1 + ((in1 * 20091) >> 16)
        add             v22.4h, v22.4h, v3.4h           // in3 + ((in3 * 20091) >> 16)

        add             v16.4h, v0.4h, v2.4h            // in0 + in2
        sub             v17.4h, v0.4h, v2.4h            // in0 - in2

        add             v18.4h, v21.4h, v23.4h
        sub             v19.4h, v20.4h, v22.4h

        add             v0.4h, v16.4h, v18.4h
        add             v1.4h, v17.4h, v19.4h
        sub             v3.4h, v16.4h, v18.4h
        sub             v2.4h, v17.4h, v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        // second pass; zeroing of the coefficient block and the loads of
        // the destination rows are interleaved with the arithmetic
        movi            v29.8h, #0
        smull           v26.4s, v1.4h, v4.h[0]
        st1             {v29.8h}, [x1], #16
        smull           v27.4s, v3.4h, v4.h[0]
        st1             {v29.16b}, [x1]
        sqdmulh         v21.4h, v1.4h, v4.h[1]
        sqdmulh         v23.4h, v3.4h, v4.h[1]
        shrn            v20.4h, v26.4s, #16
        shrn            v22.4h, v27.4s, #16
        add             v20.4h, v20.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h
        add             v16.4h, v0.4h, v2.4h
        sub             v17.4h, v0.4h, v2.4h

        add             v18.4h, v20.4h, v23.4h
        ld1             {v24.s}[0], [x0], x2
        sub             v19.4h, v21.4h, v22.4h
        ld1             {v25.s}[0], [x0], x2
        add             v0.4h, v16.4h, v18.4h
        add             v1.4h, v17.4h, v19.4h
        ld1             {v26.s}[0], [x0], x2
        sub             v3.4h, v16.4h, v18.4h
        sub             v2.4h, v17.4h, v19.4h
        ld1             {v27.s}[0], [x0], x2
        srshr           v0.4h, v0.4h, #3                // (x + 4) >> 3
        srshr           v1.4h, v1.4h, #3
        srshr           v2.4h, v2.4h, #3
        srshr           v3.4h, v3.4h, #3

        sub             x0, x0, x2, lsl #2              // rewind dst by 4 rows

        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16

        uaddw           v0.8h, v0.8h, v24.8b
        uaddw           v1.8h, v1.8h, v25.8b
        uaddw           v2.8h, v2.8h, v26.8b
        uaddw           v3.8h, v3.8h, v27.8b
        sqxtun          v0.8b, v0.8h                    // saturate to 0..255
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h

        st1             {v0.s}[0], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v2.s}[0], [x0], x2
        st1             {v3.s}[0], [x0], x2

        ret
endfunc

// ff_vp8_idct_dc_add4uv_neon
//
// DC-only inverse transform for four blocks laid out 2x2 in an 8x8 pixel
// area: rows 0-3 use DC 0 (columns 0-3) and DC 1 (columns 4-7), rows 4-7
// use DC 2 and DC 3.
//   x0: destination pixels (8 rows of 8 bytes)
//   x1: coefficient blocks; one DC halfword is read every 32 bytes (four in
//       total) and each is overwritten with zero
//   x2: destination stride in bytes
// Clobbers: x0, x1, x3, v0-v7, v16-v27
function ff_vp8_idct_dc_add4uv_neon, export=1
        movi            v0.4h, #0
        mov             x3, #32
        ld1r            {v16.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v18.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ins             v16.d[1], v17.d[0]              // v16 = dc0 x4 | dc1 x4
        ins             v18.d[1], v19.d[0]              // v18 = dc2 x4 | dc3 x4
        mov             x3, x0                          // x3 = store pointer, x0 = load pointer
        srshr           v16.8h, v16.8h, #3              // dc = (dc + 4) >> 3
        ld1             {v0.8b}, [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.8b}, [x0], x2
        uaddw           v20.8h, v16.8h, v0.8b
        ld1             {v2.8b}, [x0], x2
        uaddw           v0.8h, v16.8h, v1.8b
        ld1             {v3.8b}, [x0], x2
        uaddw           v22.8h, v16.8h, v2.8b
        ld1             {v4.8b}, [x0], x2
        uaddw           v2.8h, v16.8h, v3.8b
        ld1             {v5.8b}, [x0], x2
        uaddw           v24.8h, v18.8h, v4.8b
        ld1             {v6.8b}, [x0], x2
        uaddw           v4.8h, v18.8h, v5.8b
        ld1             {v7.8b}, [x0], x2
        uaddw           v26.8h, v18.8h, v6.8b
        sqxtun          v20.8b, v20.8h
        uaddw           v6.8h, v18.8h, v7.8b
        sqxtun          v21.8b, v0.8h
        sqxtun          v22.8b, v22.8h
        st1             {v20.8b}, [x3], x2
        sqxtun          v23.8b, v2.8h
        st1             {v21.8b}, [x3], x2
        sqxtun          v24.8b, v24.8h
        st1             {v22.8b}, [x3], x2
        sqxtun          v25.8b, v4.8h
        st1             {v23.8b}, [x3], x2
        sqxtun          v26.8b, v26.8h
        st1             {v24.8b}, [x3], x2
        sqxtun          v27.8b, v6.8h
        st1             {v25.8b}, [x3], x2
        st1             {v26.8b}, [x3], x2
        st1             {v27.8b}, [x3], x2

        ret
endfunc

// ff_vp8_idct_dc_add4y_neon
//
// DC-only inverse transform for four blocks laid out side by side in a
// 16x4 pixel area: DC k is added to columns 4k .. 4k+3 of all four rows.
//   x0: destination pixels (4 rows of 16 bytes)
//   x1: coefficient blocks; one DC halfword is read every 32 bytes (four in
//       total) and each is overwritten with zero
//   x2: destination stride in bytes
// Clobbers: x0, x1, x3, v0-v3, v16-v23
function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b, #0
        mov             x3, #32
        ld1r            {v16.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v16.2d, v16.2d, v17.2d          // v16 = dc0 x4 | dc1 x4
        ld1r            {v18.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v18.2d, v18.2d, v19.2d          // v18 = dc2 x4 | dc3 x4
        srshr           v16.8h, v16.8h, #3              // dc = (dc + 4) >> 3
        ld1             {v0.16b}, [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.16b}, [x0], x2
        uaddw           v20.8h, v16.8h, v0.8b
        ld1             {v2.16b}, [x0], x2
        uaddw2          v0.8h, v18.8h, v0.16b
        ld1             {v3.16b}, [x0], x2
        uaddw           v21.8h, v16.8h, v1.8b
        uaddw2          v1.8h, v18.8h, v1.16b
        uaddw           v22.8h, v16.8h, v2.8b
        uaddw2          v2.8h, v18.8h, v2.16b
        uaddw           v23.8h, v16.8h, v3.8b
        uaddw2          v3.8h, v18.8h, v3.16b
        sub             x0, x0, x2, lsl #2              // rewind dst by 4 rows
        sqxtun          v20.8b, v20.8h
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b, v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b, v22.8h
        st1             {v20.16b}, [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b}, [x0], x2
        sqxtun          v23.8b, v23.8h
        st1             {v22.16b}, [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b}, [x0], x2

        ret
endfunc

// ff_vp8_idct_dc_add_neon
//
// DC-only inverse transform for a single 4x4 block.
//   x0: destination pixels (4 rows of 4 bytes)
//   x1: coefficient block; the first halfword (DC) is read, then zeroed
//   x2: destination stride in bytes
// Clobbers: x0, w3, v0-v4
function ff_vp8_idct_dc_add_neon, export=1
        mov             w3, #0
        ld1r            {v2.8h}, [x1]
        strh            w3, [x1]
        srshr           v2.8h, v2.8h, #3                // dc = (dc + 4) >> 3
        ld1             {v0.s}[0], [x0], x2
        ld1             {v0.s}[1], [x0], x2
        uaddw           v3.8h, v2.8h, v0.8b
        ld1             {v1.s}[0], [x0], x2
        ld1             {v1.s}[1], [x0], x2
        uaddw           v4.8h, v2.8h, v1.8b
        sqxtun          v0.8b, v3.8h
        sqxtun          v1.8b, v4.8h
        sub             x0, x0, x2, lsl #2              // rewind dst by 4 rows
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2
        ret
endfunc

// vp8_loop_filter: shared body of all loop-filter entry points below.
//
// Register layout on entry:
//   P3..Q3     -> v0..v7 (the simple filter only reads P1..Q1 = v2..v5)
//   flim_E     -> v22
//   flim_I     -> v23 (not read when simple=1)
//   hev_thresh -> general-purpose register named by the \hev_thresh macro
//                 argument (the callers in this file pass w4 or w5)
// On exit the filtered pixels are back in v1..v6 (v2..v5 for simple/inner);
// v0 and v7 are left untouched.
// Clobbers: v16-v23
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    .if \simple
        uabd            v17.16b, v3.16b,  v4.16b        // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b        // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b       // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1            // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b, v18.16b       // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b       // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b,  v3.16b        // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b        // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b        // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b        // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b       // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b       // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b       // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b       // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b        // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b        // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b       // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b       // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b        // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b        // abs(P1-Q1)
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b       // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1            // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh            // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b       // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b       // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b       // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b       // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
    .endif

        // at this point:
        //   v16: normal_limit
        //   v17: hev (only meaningful when simple=0)

        // convert to signed value:
        eor             v3.16b,  v3.16b,  v21.16b       // PS0 = P0 ^ 0x80
        eor             v4.16b,  v4.16b,  v21.16b       // QS0 = Q0 ^ 0x80

        movi            v20.8h,  #3
        ssubl           v18.8h,  v4.8b,   v3.8b         // QS0 - PS0
        ssubl2          v19.8h,  v4.16b,  v3.16b        //   (widened to 16bit)
        eor             v2.16b,  v2.16b,  v21.16b       // PS1 = P1 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b       // QS1 = Q1 ^ 0x80
        mul             v18.8h,  v18.8h,  v20.8h        // w = 3 * (QS0 - PS0)
        mul             v19.8h,  v19.8h,  v20.8h

        sqsub           v20.16b, v2.16b,  v5.16b        // clamp(PS1-QS1)
        movi            v22.16b, #4
        movi            v23.16b, #3
    .if \inner
        and             v20.16b, v20.16b, v17.16b       // if(hev) w += clamp(PS1-QS1)
    .endif
        saddw           v18.8h,  v18.8h,  v20.8b        // w += clamp(PS1-QS1)
        saddw2          v19.8h,  v19.8h,  v20.16b
        sqxtn           v18.8b,  v18.8h                 // narrow result back into v18
        sqxtn2          v18.16b, v19.8h
    .if !\inner && !\simple
        eor             v1.16b,  v1.16b,  v21.16b       // PS2 = P2 ^ 0x80
        eor             v6.16b,  v6.16b,  v21.16b       // QS2 = Q2 ^ 0x80
    .endif
        and             v18.16b, v18.16b, v16.16b       // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v1-v6 -> PS2-QS2
        //   v7 -> Q3  (don't corrupt)
        //   v17 -> hev
        //   v18 -> w
        //   v21 -> #0x80
        //   v22 -> #4
        //   v23 -> #3
        //   v16, v19, v20 -> free (scratch)
        //
        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);

    .if \simple
        sqadd           v19.16b, v18.16b, v22.16b       // c1 = clamp(w+4)
        sqadd           v20.16b, v18.16b, v23.16b       // c2 = clamp(w+3)
        sshr            v19.16b, v19.16b, #3            // c1 >>= 3
        sshr            v20.16b, v20.16b, #3            // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b       // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b       // PS0 = clamp(PS0+c2)
        eor             v4.16b,  v4.16b,  v21.16b       // Q0 = QS0 ^ 0x80
        eor             v3.16b,  v3.16b,  v21.16b       // P0 = PS0 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b       // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b       // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd           v19.16b, v18.16b, v22.16b       // c1 = clamp(w+4)
        sqadd           v20.16b, v18.16b, v23.16b       // c2 = clamp(w+3)
        sshr            v19.16b, v19.16b, #3            // c1 >>= 3
        sshr            v20.16b, v20.16b, #3            // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b       // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b       // PS0 = clamp(PS0+c2)
        bic             v19.16b, v19.16b, v17.16b       // c1 & ~hev
        eor             v4.16b,  v4.16b,  v21.16b       // Q0 = QS0 ^ 0x80
        srshr           v19.16b, v19.16b, #1            // c3 = ((c1&~hev) + 1) >> 1
        eor             v3.16b,  v3.16b,  v21.16b       // P0 = PS0 ^ 0x80
        sqsub           v5.16b,  v5.16b,  v19.16b       // QS1 = clamp(QS1-c3)
        sqadd           v2.16b,  v2.16b,  v19.16b       // PS1 = clamp(PS1+c3)
        eor             v5.16b,  v5.16b,  v21.16b       // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b       // P1 = PS1 ^ 0x80
    .else
        and             v20.16b, v18.16b, v17.16b       // w & hev
        sqadd           v19.16b, v20.16b, v22.16b       // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v20.16b, v23.16b       // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3            // c1 >>= 3
        sshr            v20.16b, v20.16b, #3            // c2 >>= 3
        bic             v18.16b, v18.16b, v17.16b       // w &= ~hev
        sqsub           v4.16b,  v4.16b,  v19.16b       // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b       // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi            v17.8h,  #63
        sshll           v22.8h,  v18.8b,  #3            // 8*w
        sshll2          v23.8h,  v18.16b, #3
        saddw           v22.8h,  v22.8h,  v18.8b        // 9*w
        saddw2          v23.8h,  v23.8h,  v18.16b
        add             v16.8h,  v17.8h,  v22.8h
        add             v17.8h,  v17.8h,  v23.8h        //  9*w + 63
        add             v19.8h,  v16.8h,  v22.8h
        add             v20.8h,  v17.8h,  v23.8h        // 18*w + 63
        add             v22.8h,  v19.8h,  v22.8h
        add             v23.8h,  v20.8h,  v23.8h        // 27*w + 63
        sqshrn          v16.8b,  v16.8h,  #7
        sqshrn2         v16.16b, v17.8h,  #7            // clamp(( 9*w + 63)>>7)
        sqshrn          v19.8b,  v19.8h,  #7
        sqshrn2         v19.16b, v20.8h,  #7            // clamp((18*w + 63)>>7)
        sqshrn          v22.8b,  v22.8h,  #7
        sqshrn2         v22.16b, v23.8h,  #7            // clamp((27*w + 63)>>7)
        sqadd           v1.16b,  v1.16b,  v16.16b       // PS2 = clamp(PS2+a)
        sqsub           v6.16b,  v6.16b,  v16.16b       // QS2 = clamp(QS2-a)
        sqadd           v2.16b,  v2.16b,  v19.16b       // PS1 = clamp(PS1+a)
        sqsub           v5.16b,  v5.16b,  v19.16b       // QS1 = clamp(QS1-a)
        sqadd           v3.16b,  v3.16b,  v22.16b       // PS0 = clamp(PS0+a)
        sqsub           v4.16b,  v4.16b,  v22.16b       // QS0 = clamp(QS0-a)
        eor             v3.16b,  v3.16b,  v21.16b       // P0 = PS0 ^ 0x80
        eor             v4.16b,  v4.16b,  v21.16b       // Q0 = QS0 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b       // P1 = PS1 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b       // Q1 = QS1 ^ 0x80
        eor             v1.16b,  v1.16b,  v21.16b       // P2 = PS2 ^ 0x80
        eor             v6.16b,  v6.16b,  v21.16b       // Q2 = QS2 ^ 0x80
    .endif
.endm

// ff_vp8_v_loop_filter16{,_inner,_simple}_neon
//
// Filters 16 pixels across a horizontal edge (rows above/below the edge).
//   x0: pointer to the Q0 row (first row below the edge)
//   x1: stride in bytes
//   w2: flim_E
//   w3: flim_I      (not read by the _simple variant)
//   w4: hev_thresh  (not read by the _simple variant)
// Clobbers: x0, v0-v7, v16-v23
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        // step back to the first row that is loaded: 4 rows (P3) for the
        // normal/inner filters, 2 rows (P1) for the simple filter
        sub             x0,  x0,  x1,  lsl #1+!\simple

        // Load pixels:
    .if !\simple
        ld1             {v0.16b}, [x0], x1              // P3
        ld1             {v1.16b}, [x0], x1              // P2
    .endif
        ld1             {v2.16b}, [x0], x1              // P1
        ld1             {v3.16b}, [x0], x1              // P0
        ld1             {v4.16b}, [x0], x1              // Q0
        ld1             {v5.16b}, [x0], x1              // Q1
    .if !\simple
        ld1             {v6.16b}, [x0], x1              // Q2
        ld1             {v7.16b}, [x0]                  // Q3
        dup             v23.16b, w3                     // flim_I
    .endif
        dup             v22.16b, w2                     // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to P2 (dst -= stride * 6); the simple filter only backs
        // up 4 rows, to P1
        sub             x0,  x0,  x1,  lsl #2
    .if !\simple
        sub             x0,  x0,  x1,  lsl #1

        // Store pixels:
        st1             {v1.16b}, [x0], x1              // P2
    .endif
        st1             {v2.16b}, [x0], x1              // P1
        st1             {v3.16b}, [x0], x1              // P0
        st1             {v4.16b}, [x0], x1              // Q0
        st1             {v5.16b}, [x0], x1              // Q1
    .if !\simple
        st1             {v6.16b}, [x0]                  // Q2
    .endif

        ret
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

// ff_vp8_v_loop_filter8uv{,_inner}_neon
//
// Filters a horizontal edge of two 8-pixel-wide planes at once; the first
// plane occupies the low 64 bits of each vector, the second the high 64.
//   x0: pointer to the Q0 row of the first plane (u)
//   x1: pointer to the Q0 row of the second plane (v)
//   x2: stride in bytes (shared by both planes)
//   w3: flim_E
//   w4: flim_I
//   w5: hev_thresh
// Clobbers: x0, x1, v0-v7, v16-v23
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
        ld1             {v0.d}[0], [x0], x2             // P3
        ld1             {v0.d}[1], [x1], x2             // P3
        ld1             {v1.d}[0], [x0], x2             // P2
        ld1             {v1.d}[1], [x1], x2             // P2
        ld1             {v2.d}[0], [x0], x2             // P1
        ld1             {v2.d}[1], [x1], x2             // P1
        ld1             {v3.d}[0], [x0], x2             // P0
        ld1             {v3.d}[1], [x1], x2             // P0
        ld1             {v4.d}[0], [x0], x2             // Q0
        ld1             {v4.d}[1], [x1], x2             // Q0
        ld1             {v5.d}[0], [x0], x2             // Q1
        ld1             {v5.d}[1], [x1], x2             // Q1
        ld1             {v6.d}[0], [x0], x2             // Q2
        ld1             {v6.d}[1], [x1], x2             // Q2
        ld1             {v7.d}[0], [x0]                 // Q3
        ld1             {v7.d}[1], [x1]                 // Q3

        dup             v22.16b, w3                     // flim_E
        dup             v23.16b, w4                     // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        sub             x0,  x0,  x2,  lsl #1
        sub             x1,  x1,  x2,  lsl #1

        // Store pixels:

        st1             {v1.d}[0], [x0], x2             // P2
        st1             {v1.d}[1], [x1], x2             // P2
        st1             {v2.d}[0], [x0], x2             // P1
        st1             {v2.d}[1], [x1], x2             // P1
        st1             {v3.d}[0], [x0], x2             // P0
        st1             {v3.d}[1], [x1], x2             // P0
        st1             {v4.d}[0], [x0], x2             // Q0
        st1             {v4.d}[1], [x1], x2             // Q0
        st1             {v5.d}[0], [x0], x2             // Q1
        st1             {v5.d}[1], [x1], x2             // Q1
        st1             {v6.d}[0], [x0]                 // Q2
        st1             {v6.d}[1], [x1]                 // Q2

        ret
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

// ff_vp8_h_loop_filter16{,_inner,_simple}_neon
//
// Filters 16 rows across a vertical edge. Eight bytes (4 on each side of
// the edge) are loaded per row, transposed so that v0..v7 hold P3..Q3,
// filtered, transposed back and stored.
//   x0: pointer to the Q0 column (first pixel right of the edge), row 0
//   x1: stride in bytes
//   w2: flim_E
//   w3: flim_I      (not read by the _simple variant)
//   w4: hev_thresh  (not read by the _simple variant)
// Clobbers: x0, v0-v7, v16-v23, v30, v31
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  #4
        // Load pixels:
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2                     // flim_E
    .if !\simple
        dup             v23.16b, w3                     // flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0,  x0,  x1,  lsl #4           // backup 16 rows

        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0]

        ret
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

// ff_vp8_h_loop_filter8uv{,_inner}_neon
//
// Filters 8 rows across a vertical edge of two planes at once; rows of the
// first plane go to the low 64 bits of v0..v7, rows of the second plane to
// the high 64 bits.
//   x0: pointer to the Q0 column of the first plane (u), row 0
//   x1: pointer to the Q0 column of the second plane (v), row 0
//   x2: stride in bytes (shared by both planes)
//   w3: flim_E
//   w4: flim_I
//   w5: hev_thresh
// Clobbers: x0, x1, v0-v7, v16-v23, v30, v31
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  #4
        sub             x1,  x1,  #4

        // Load pixels:
        ld1             {v0.d}[0], [x0], x2             // load u
        ld1             {v0.d}[1], [x1], x2             // load v
        ld1             {v1.d}[0], [x0], x2
        ld1             {v1.d}[1], [x1], x2
        ld1             {v2.d}[0], [x0], x2
        ld1             {v2.d}[1], [x1], x2
        ld1             {v3.d}[0], [x0], x2
        ld1             {v3.d}[1], [x1], x2
        ld1             {v4.d}[0], [x0], x2
        ld1             {v4.d}[1], [x1], x2
        ld1             {v5.d}[0], [x0], x2
        ld1             {v5.d}[1], [x1], x2
        ld1             {v6.d}[0], [x0], x2
        ld1             {v6.d}[1], [x1], x2
        ld1             {v7.d}[0], [x0], x2
        ld1             {v7.d}[1], [x1], x2

        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3                     // flim_E
        dup             v23.16b, w4                     // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0,  x0,  x2,  lsl #3           // backup u 8 rows
        sub             x1,  x1,  x2,  lsl #3           // backup v 8 rows

        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x2             // store u
        st1             {v0.d}[1], [x1], x2             // store v
        st1             {v1.d}[0], [x0], x2
        st1             {v1.d}[1], [x1], x2
        st1             {v2.d}[0], [x0], x2
        st1             {v2.d}[1], [x1], x2
        st1             {v3.d}[0], [x0], x2
        st1             {v3.d}[1], [x1], x2
        st1             {v4.d}[0], [x0], x2
        st1             {v4.d}[1], [x1], x2
        st1             {v5.d}[0], [x0], x2
        st1             {v5.d}[1], [x1], x2
        st1             {v6.d}[0], [x0], x2
        st1             {v6.d}[1], [x1], x2
        st1             {v7.d}[0], [x0]
        st1             {v7.d}[1], [x1]

        ret

endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1


// ff_put_vp8_pixels16_neon
//
// Copies a 16-byte-wide block, four rows per loop iteration.
//   x0: destination pointer      x1: destination stride in bytes
//   x2: source pointer           x3: source stride in bytes
//   w4: row count; the loop body always runs at least once and repeats
//       while the remaining count is > 0, so a count that is not a positive
//       multiple of 4 is effectively rounded up to the next multiple
// Clobbers: x0, x2, w4, v0-v3, flags
function ff_put_vp8_pixels16_neon, export=1
1:
        subs            w4, w4, #4
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x2], x3
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x0], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        b.gt            1b                              // ld1/st1 leave the flags from subs intact
        ret
endfunc

698cabdff1aSopenharmony_cifunction ff_put_vp8_pixels8_neon, export=1 699cabdff1aSopenharmony_ci1: 700cabdff1aSopenharmony_ci subs w4, w4, #4 701cabdff1aSopenharmony_ci ld1 {v0.8b}, [x2], x3 702cabdff1aSopenharmony_ci ld1 {v0.d}[1], [x2], x3 703cabdff1aSopenharmony_ci ld1 {v1.8b}, 
[x2], x3 704cabdff1aSopenharmony_ci ld1 {v1.d}[1], [x2], x3 705cabdff1aSopenharmony_ci st1 {v0.8b}, [x0], x1 706cabdff1aSopenharmony_ci st1 {v0.d}[1], [x0], x1 707cabdff1aSopenharmony_ci st1 {v1.8b}, [x0], x1 708cabdff1aSopenharmony_ci st1 {v1.d}[1], [x0], x1 709cabdff1aSopenharmony_ci b.gt 1b 710cabdff1aSopenharmony_ci ret 711cabdff1aSopenharmony_ciendfunc 712cabdff1aSopenharmony_ci 713cabdff1aSopenharmony_ci/* 4/6-tap 8th-pel MC */ 714cabdff1aSopenharmony_ci 715cabdff1aSopenharmony_ci.macro vp8_epel8_h6 d, s0, s1 716cabdff1aSopenharmony_ci ext v22.8b, \s0\().8b, \s1\().8b, #1 717cabdff1aSopenharmony_ci uxtl v18.8h, \s0\().8b 718cabdff1aSopenharmony_ci ext v23.8b, \s0\().8b, \s1\().8b, #2 719cabdff1aSopenharmony_ci uxtl v19.8h, v22.8b 720cabdff1aSopenharmony_ci ext v24.8b, \s0\().8b, \s1\().8b, #3 721cabdff1aSopenharmony_ci uxtl v21.8h, v23.8b 722cabdff1aSopenharmony_ci ext v25.8b, \s0\().8b, \s1\().8b, #4 723cabdff1aSopenharmony_ci uxtl v22.8h, v24.8b 724cabdff1aSopenharmony_ci ext v26.8b, \s0\().8b, \s1\().8b, #5 725cabdff1aSopenharmony_ci uxtl v25.8h, v25.8b 726cabdff1aSopenharmony_ci mul v21.8h, v21.8h, v0.h[2] 727cabdff1aSopenharmony_ci uxtl v26.8h, v26.8b 728cabdff1aSopenharmony_ci mul v22.8h, v22.8h, v0.h[3] 729cabdff1aSopenharmony_ci mls v21.8h, v19.8h, v0.h[1] 730cabdff1aSopenharmony_ci mls v22.8h, v25.8h, v0.h[4] 731cabdff1aSopenharmony_ci mla v21.8h, v18.8h, v0.h[0] 732cabdff1aSopenharmony_ci mla v22.8h, v26.8h, v0.h[5] 733cabdff1aSopenharmony_ci sqadd v22.8h, v21.8h, v22.8h 734cabdff1aSopenharmony_ci sqrshrun \d\().8b, v22.8h, #7 735cabdff1aSopenharmony_ci.endm 736cabdff1aSopenharmony_ci 737cabdff1aSopenharmony_ci.macro vp8_epel16_h6 d0, v0, v1 738cabdff1aSopenharmony_ci ext v22.16b, \v0\().16b, \v1\().16b, #3 739cabdff1aSopenharmony_ci ext v23.16b, \v0\().16b, \v1\().16b, #4 740cabdff1aSopenharmony_ci uxtl v19.8h, v22.8b 741cabdff1aSopenharmony_ci uxtl2 v22.8h, v22.16b 742cabdff1aSopenharmony_ci ext v3.16b, \v0\().16b, \v1\().16b, #2 
743cabdff1aSopenharmony_ci uxtl v20.8h, v23.8b 744cabdff1aSopenharmony_ci uxtl2 v23.8h, v23.16b 745cabdff1aSopenharmony_ci ext v16.16b, \v0\().16b, \v1\().16b, #1 746cabdff1aSopenharmony_ci uxtl v18.8h, v3.8b 747cabdff1aSopenharmony_ci uxtl2 v3.8h, v3.16b 748cabdff1aSopenharmony_ci ext v2.16b, \v0\().16b, \v1\().16b, #5 749cabdff1aSopenharmony_ci uxtl v21.8h, v2.8b 750cabdff1aSopenharmony_ci uxtl2 v2.8h, v2.16b 751cabdff1aSopenharmony_ci uxtl v17.8h, v16.8b 752cabdff1aSopenharmony_ci uxtl2 v16.8h, v16.16b 753cabdff1aSopenharmony_ci mul v19.8h, v19.8h, v0.h[3] 754cabdff1aSopenharmony_ci mul v18.8h, v18.8h, v0.h[2] 755cabdff1aSopenharmony_ci mul v3.8h, v3.8h, v0.h[2] 756cabdff1aSopenharmony_ci mul v22.8h, v22.8h, v0.h[3] 757cabdff1aSopenharmony_ci mls v19.8h, v20.8h, v0.h[4] 758cabdff1aSopenharmony_ci uxtl v20.8h, \v0\().8b 759cabdff1aSopenharmony_ci uxtl2 v1.8h, \v0\().16b 760cabdff1aSopenharmony_ci mls v18.8h, v17.8h, v0.h[1] 761cabdff1aSopenharmony_ci mls v3.8h, v16.8h, v0.h[1] 762cabdff1aSopenharmony_ci mls v22.8h, v23.8h, v0.h[4] 763cabdff1aSopenharmony_ci mla v18.8h, v20.8h, v0.h[0] 764cabdff1aSopenharmony_ci mla v19.8h, v21.8h, v0.h[5] 765cabdff1aSopenharmony_ci mla v3.8h, v1.8h, v0.h[0] 766cabdff1aSopenharmony_ci mla v22.8h, v2.8h, v0.h[5] 767cabdff1aSopenharmony_ci sqadd v19.8h, v18.8h, v19.8h 768cabdff1aSopenharmony_ci sqadd v22.8h, v3.8h, v22.8h 769cabdff1aSopenharmony_ci sqrshrun \d0\().8b, v19.8h, #7 770cabdff1aSopenharmony_ci sqrshrun2 \d0\().16b, v22.8h, #7 771cabdff1aSopenharmony_ci.endm 772cabdff1aSopenharmony_ci 773cabdff1aSopenharmony_ci.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 774cabdff1aSopenharmony_ci uxtl \s0\().8h, \s0\().8b 775cabdff1aSopenharmony_ci uxtl \s3\().8h, \s3\().8b 776cabdff1aSopenharmony_ci uxtl \s6\().8h, \s6\().8b 777cabdff1aSopenharmony_ci uxtl \s1\().8h, \s1\().8b 778cabdff1aSopenharmony_ci uxtl \s4\().8h, \s4\().8b 779cabdff1aSopenharmony_ci uxtl \s2\().8h, \s2\().8b 780cabdff1aSopenharmony_ci uxtl \s5\().8h, 
\s5\().8b 781cabdff1aSopenharmony_ci mul \s0\().8h, \s0\().8h, v0.h[0] 782cabdff1aSopenharmony_ci mul v31.8h , \s3\().8h, v0.h[3] 783cabdff1aSopenharmony_ci mul \s3\().8h, \s3\().8h, v0.h[2] 784cabdff1aSopenharmony_ci mul \s6\().8h, \s6\().8h, v0.h[5] 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci mls \s0\().8h, \s1\().8h, v0.h[1] 787cabdff1aSopenharmony_ci mls v31.8h , \s4\().8h, v0.h[4] 788cabdff1aSopenharmony_ci mls \s3\().8h, \s2\().8h, v0.h[1] 789cabdff1aSopenharmony_ci mls \s6\().8h, \s5\().8h, v0.h[4] 790cabdff1aSopenharmony_ci 791cabdff1aSopenharmony_ci mla \s0\().8h, \s2\().8h, v0.h[2] 792cabdff1aSopenharmony_ci mla v31.8h , \s5\().8h, v0.h[5] 793cabdff1aSopenharmony_ci mla \s3\().8h, \s1\().8h, v0.h[0] 794cabdff1aSopenharmony_ci mla \s6\().8h, \s4\().8h, v0.h[3] 795cabdff1aSopenharmony_ci sqadd v31.8h , \s0\().8h, v31.8h 796cabdff1aSopenharmony_ci sqadd \s6\().8h, \s3\().8h, \s6\().8h 797cabdff1aSopenharmony_ci sqrshrun \d0\().8b, v31.8h, #7 798cabdff1aSopenharmony_ci sqrshrun \d1\().8b, \s6\().8h, #7 799cabdff1aSopenharmony_ci.endm 800cabdff1aSopenharmony_ci 801cabdff1aSopenharmony_ci.macro vp8_epel8_h4 d, v0, v1 802cabdff1aSopenharmony_ci ext v22.8b, \v0\().8b, \v1\().8b, #1 803cabdff1aSopenharmony_ci uxtl v19.8h, \v0\().8b 804cabdff1aSopenharmony_ci ext v23.8b, \v0\().8b, \v1\().8b, #2 805cabdff1aSopenharmony_ci uxtl v20.8h, v22.8b 806cabdff1aSopenharmony_ci ext v25.8b, \v0\().8b, \v1\().8b, #3 807cabdff1aSopenharmony_ci uxtl v22.8h, v23.8b 808cabdff1aSopenharmony_ci uxtl v25.8h, v25.8b 809cabdff1aSopenharmony_ci mul v20.8h, v20.8h, v0.h[2] 810cabdff1aSopenharmony_ci mul v22.8h, v22.8h, v0.h[3] 811cabdff1aSopenharmony_ci mls v20.8h, v19.8h, v0.h[1] 812cabdff1aSopenharmony_ci mls v22.8h, v25.8h, v0.h[4] 813cabdff1aSopenharmony_ci sqadd v22.8h, v20.8h, v22.8h 814cabdff1aSopenharmony_ci sqrshrun \d\().8b, v22.8h, #7 815cabdff1aSopenharmony_ci.endm 816cabdff1aSopenharmony_ci 817cabdff1aSopenharmony_ci.macro vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4 
818cabdff1aSopenharmony_ci uxtl \s0\().8h, \s0\().8b 819cabdff1aSopenharmony_ci uxtl \s1\().8h, \s1\().8b 820cabdff1aSopenharmony_ci uxtl \s2\().8h, \s2\().8b 821cabdff1aSopenharmony_ci uxtl \s3\().8h, \s3\().8b 822cabdff1aSopenharmony_ci uxtl \s4\().8h, \s4\().8b 823cabdff1aSopenharmony_ci mul v21.8h, \s1\().8h, v0.h[2] 824cabdff1aSopenharmony_ci mul v23.8h, \s2\().8h, v0.h[3] 825cabdff1aSopenharmony_ci mul \s2\().8h, \s2\().8h, v0.h[2] 826cabdff1aSopenharmony_ci mul v22.8h, \s3\().8h, v0.h[3] 827cabdff1aSopenharmony_ci mls v21.8h, \s0\().8h, v0.h[1] 828cabdff1aSopenharmony_ci mls v23.8h, \s3\().8h, v0.h[4] 829cabdff1aSopenharmony_ci mls \s2\().8h, \s1\().8h, v0.h[1] 830cabdff1aSopenharmony_ci mls v22.8h, \s4\().8h, v0.h[4] 831cabdff1aSopenharmony_ci sqadd v21.8h, v21.8h, v23.8h 832cabdff1aSopenharmony_ci sqadd \s2\().8h, \s2\().8h, v22.8h 833cabdff1aSopenharmony_ci sqrshrun \d0\().8b, v21.8h, #7 834cabdff1aSopenharmony_ci sqrshrun2 \d0\().16b, \s2\().8h, #7 835cabdff1aSopenharmony_ci.endm 836cabdff1aSopenharmony_ci 837cabdff1aSopenharmony_ci 838cabdff1aSopenharmony_ci// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit 839cabdff1aSopenharmony_ci// arithmetic can be used to apply filters 840cabdff1aSopenharmony_ciconst subpel_filters, align=4 841cabdff1aSopenharmony_ci .short 0, 6, 123, 12, 1, 0, 0, 0 842cabdff1aSopenharmony_ci .short 2, 11, 108, 36, 8, 1, 0, 0 843cabdff1aSopenharmony_ci .short 0, 9, 93, 50, 6, 0, 0, 0 844cabdff1aSopenharmony_ci .short 3, 16, 77, 77, 16, 3, 0, 0 845cabdff1aSopenharmony_ci .short 0, 6, 50, 93, 9, 0, 0, 0 846cabdff1aSopenharmony_ci .short 1, 8, 36, 108, 11, 2, 0, 0 847cabdff1aSopenharmony_ci .short 0, 1, 12, 123, 6, 0, 0, 0 848cabdff1aSopenharmony_ciendconst 849cabdff1aSopenharmony_ci 850cabdff1aSopenharmony_cifunction ff_put_vp8_epel16_v6_neon, export=1 851cabdff1aSopenharmony_ci sub x2, x2, x3, lsl #1 852cabdff1aSopenharmony_ci 853cabdff1aSopenharmony_ci sxtw x4, w4 854cabdff1aSopenharmony_ci sxtw x6, w6 
855cabdff1aSopenharmony_ci movrel x17, subpel_filters, -16 856cabdff1aSopenharmony_ci add x6, x17, x6, lsl #4 // y 857cabdff1aSopenharmony_ci ld1 {v0.8h}, [x6] 858cabdff1aSopenharmony_ci1: 859cabdff1aSopenharmony_ci ld1 {v1.1d - v2.1d}, [x2], x3 860cabdff1aSopenharmony_ci ld1 {v3.1d - v4.1d}, [x2], x3 861cabdff1aSopenharmony_ci ld1 {v16.1d - v17.1d}, [x2], x3 862cabdff1aSopenharmony_ci ld1 {v18.1d - v19.1d}, [x2], x3 863cabdff1aSopenharmony_ci ld1 {v20.1d - v21.1d}, [x2], x3 864cabdff1aSopenharmony_ci ld1 {v22.1d - v23.1d}, [x2], x3 865cabdff1aSopenharmony_ci ld1 {v24.1d - v25.1d}, [x2] 866cabdff1aSopenharmony_ci sub x2, x2, x3, lsl #2 867cabdff1aSopenharmony_ci 868cabdff1aSopenharmony_ci vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24 869cabdff1aSopenharmony_ci vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25 870cabdff1aSopenharmony_ci 871cabdff1aSopenharmony_ci st1 {v1.1d - v2.1d}, [x0], x1 872cabdff1aSopenharmony_ci st1 {v3.1d - v4.1d}, [x0], x1 873cabdff1aSopenharmony_ci subs x4, x4, #2 874cabdff1aSopenharmony_ci b.ne 1b 875cabdff1aSopenharmony_ci 876cabdff1aSopenharmony_ci ret 877cabdff1aSopenharmony_ciendfunc 878cabdff1aSopenharmony_ci 879cabdff1aSopenharmony_cifunction ff_put_vp8_epel16_h6_neon, export=1 880cabdff1aSopenharmony_ci sub x2, x2, #2 881cabdff1aSopenharmony_ci sxtw x5, w5 // x 882cabdff1aSopenharmony_ci 883cabdff1aSopenharmony_ci // first pass (horizontal): 884cabdff1aSopenharmony_ci movrel x17, subpel_filters, -16 885cabdff1aSopenharmony_ci add x5, x17, x5, lsl #4 // x 886cabdff1aSopenharmony_ci ld1 {v0.8h}, [x5] 887cabdff1aSopenharmony_ci1: 888cabdff1aSopenharmony_ci ld1 {v1.16b, v2.16b}, [x2], x3 889cabdff1aSopenharmony_ci vp8_epel16_h6 v1, v1, v2 890cabdff1aSopenharmony_ci st1 {v1.16b}, [x0], x1 891cabdff1aSopenharmony_ci 892cabdff1aSopenharmony_ci subs w4, w4, #1 893cabdff1aSopenharmony_ci b.ne 1b 894cabdff1aSopenharmony_ci ret 895cabdff1aSopenharmony_ciendfunc 896cabdff1aSopenharmony_ci 897cabdff1aSopenharmony_ci 
// ff_put_vp8_epel16_h6v6_neon: 16-wide 6-tap horizontal + 6-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; must be even and non-zero, the output loop counts down by 2),
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 filters h+5 rows horizontally into a 16-byte aligned stack buffer
// (16 bytes per row). 336 bytes hold at most 21 rows, i.e. h <= 16; the
// extra 16 bytes are alignment slack. Pass 2 filters that buffer vertically.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5                 // x
        add             x16, x17, x5,  lsl #4   // x
        sub             sp,  sp,  #336+16
        ld1             {v0.8h},  [x16]
        add             x7,  sp,  #15
        sxtw            x4,  w4
        add             x16, x4,  #5            // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b


        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b},    [x7], #32
        ld1             {v16.8b - v19.8b},  [x7], #32
        ld1             {v20.8b - v23.8b},  [x7], #32
        ld1             {v24.8b - v25.8b},  [x7]
        sub             x7,  x7,  #64           // net advance: two buffer rows

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
        trn1            v1.2d, v1.2d, v2.2d     // join left and right halves
        trn1            v3.2d, v3.2d, v4.2d

        st1             {v1.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #336+16
        ret
endfunc

// ff_put_vp8_epel8_v6_neon: 8-wide vertical 6-tap interpolation.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; counted down by 2 to exactly zero),
//   w6 = vertical filter index (zero-extended)
function ff_put_vp8_epel8_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},  [x6]
1:
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2], x3
        ld1             {v7.8b},  [x2], x3
        ld1             {v28.8b}, [x2]

        sub             x2,  x2,  x3,  lsl #2   // net advance: two rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc

// ff_put_vp8_epel8_h6_neon: 8-wide horizontal 6-tap interpolation.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; one per iteration, counted down to exactly zero),
//   w5 = horizontal filter index (zero-extended)
function ff_put_vp8_epel8_h6_neon, export=1
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},  [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h6    v2,  v2,  v3

        st1             {v2.8b},  [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

// ff_put_vp8_epel8_h6v6_neon: 8-wide 6-tap horizontal + 6-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; must be even and non-zero),
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 writes h+5 rows of 8 bytes to a 16-byte aligned stack buffer
// (168 bytes hold 21 rows, i.e. h <= 16); pass 2 filters it vertically.
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4   // x
        // 168 bytes of rows plus alignment slack, rounded up from 184 to 192
        // so that SP itself stays 16-byte aligned inside this function.
        sub             sp,  sp,  #192
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5            // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b},  [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16           // net advance: two buffer rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #192
        ret
endfunc

// ff_put_vp8_epel8_v4_neon: 8-wide vertical 4-tap interpolation.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; counted down by 2 to exactly zero),
//   w6 = vertical filter index (zero-extended)
// src is moved up one row; five rows are loaded per pair of output rows.
function ff_put_vp8_epel8_v4_neon, export=1
        sub             x2,  x2,  x3

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},  [x6]
1:
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2]
        sub             x2,  x2,  x3,  lsl #1   // net advance: two rows

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc

// ff_put_vp8_epel8_h4_neon: 8-wide horizontal 4-tap interpolation.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; one per iteration, counted down to exactly zero),
//   w5 = horizontal filter index (zero-extended)
function ff_put_vp8_epel8_h4_neon, export=1
        sub             x2,  x2,  #1

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},  [x5]
1:
        ld1             {v2.8b,v3.8b}, [x2], x3

        vp8_epel8_h4    v2,  v2,  v3

        st1             {v2.8b},  [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

// ff_put_vp8_epel8_h4v6_neon: 8-wide 4-tap horizontal + 6-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; must be even and non-zero, the output loop counts down by 2),
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 filters h+5 rows (starting two rows above and one byte left of src)
// into a 16-byte aligned stack buffer, 8 bytes per row; 168 bytes hold
// 21 rows, i.e. h <= 16. Pass 2 runs the vertical 6-tap filter over that
// buffer, two output rows per iteration.
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #1
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4   // x
        // 168 bytes of rows plus alignment slack, rounded up from 184 to 192
        // so that SP itself stays 16-byte aligned inside this function.
        sub             sp,  sp,  #192
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5            // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b},  [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16           // net advance: two buffer rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #192
        ret
endfunc
// ff_put_vp8_epel8_h4v4_neon: 8-wide 4-tap horizontal + 4-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; must be even and non-zero, the output loop counts down by 2),
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 filters h+3 rows (starting one row above and one byte left of src)
// into a 16-byte aligned stack buffer, 8 bytes per row. Pass 2 reads five
// buffer rows per iteration and emits two output rows.
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #1
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4   // x
        // Same frame as the 6-tap variants (184 bytes originally), rounded
        // up to 192 so that SP itself stays 16-byte aligned in this function.
        sub             sp,  sp,  #192
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #3            // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b},  [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16  // advance: two buffer rows
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #192
        ret
endfunc

// ff_put_vp8_epel8_h6v4_neon: 8-wide 6-tap horizontal + 4-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; must be even and non-zero),
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 filters h+3 rows into a 16-byte aligned stack buffer (8 bytes per
// row); pass 2 reads five buffer rows per iteration and emits two rows.
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #2
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4   // x
        // Frame rounded up from 184 to 192 bytes so that SP itself stays
        // 16-byte aligned inside this function.
        sub             sp,  sp,  #192
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #3            // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b},  [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16  // advance: two buffer rows
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #192
        ret
endfunc

// ff_put_vp8_epel4_v6_neon: 4-wide vertical 6-tap interpolation.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; counted down by 4 to exactly zero),
//   w6 = vertical filter index (zero-extended)
// Four output rows per iteration: 32-bit lane 0 of each source register
// holds rows r..r+6 and lane 1 holds rows r+2..r+8, so one 8-wide macro
// expansion yields rows 0/1 in lane 0 and rows 2/3 in lane 1 of v2/v3.
function ff_put_vp8_epel4_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},  [x6]
1:
        ld1r            {v2.2s},    [x2], x3
        ld1r            {v3.2s},    [x2], x3
        ld1r            {v4.2s},    [x2], x3
        ld1r            {v5.2s},    [x2], x3
        ld1r            {v6.2s},    [x2], x3
        ld1r            {v7.2s},    [x2], x3
        ld1r            {v28.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #2   // back to row r+2
        ld1             {v2.s}[1],  [x2], x3
        ld1             {v3.s}[1],  [x2], x3
        ld1             {v4.s}[1],  [x2], x3
        ld1             {v5.s}[1],  [x2], x3
        ld1             {v6.s}[1],  [x2], x3
        ld1             {v7.s}[1],  [x2], x3
        ld1             {v28.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #2   // net advance: four rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc

// ff_put_vp8_epel4_h6_neon: 4-wide horizontal 6-tap interpolation.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; one per iteration, counted down to exactly zero),
//   w5 = horizontal filter index (zero-extended)
// Uses the 8-wide macro and stores only the first four result bytes.
function ff_put_vp8_epel4_h6_neon, export=1
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},  [x5]
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

// ff_put_vp8_epel4_h6v6_neon: 4-wide 6-tap horizontal + 6-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; the output loop counts down by 4 to exactly zero),
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 stores h+5 rows of 4 bytes to a stack buffer (52 bytes are used for
// 13 rows, i.e. h <= 8). Pass 2 loads the buffer twice at an offset of two
// rows and interleaves 32-bit lanes so that lane 0 carries rows r..r+6 and
// lane 1 rows r+2..r+8, then emits four output rows.
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},  [x5]

        // 52 bytes are needed; 64 are reserved so SP stays 16-byte aligned.
        sub             sp,  sp,  #64
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},  [x6]
        mov             x9,  sp
2:
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1             {v6.8b},       [x9], #8
        ld1r            {v28.2s},      [x9]
        sub             x9,  x9,  #16           // back to buffer row r+2
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v7.8b},       [x9], #8
        ld1             {v28.s}[1],    [x9]
        sub             x9,  x9,  #16           // net advance: four buffer rows
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v3.s}[1], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #64
        ret
endfunc

// ff_put_vp8_epel4_h4v6_neon: 4-wide 4-tap horizontal + 6-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; the output loop counts down by 4 to exactly zero),
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 loads only 8 source bytes per row and passes the same register as
// both halves to vp8_epel8_h4; the upper result lanes are therefore not
// meaningful, and only the first four bytes are stored. Pass 2 is identical
// to the one in ff_put_vp8_epel4_h6v6_neon.
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #1

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},  [x5]

        // 52 bytes are needed; 64 are reserved so SP stays 16-byte aligned.
        sub             sp,  sp,  #64
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b},  [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0], [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},  [x6]
        mov             x9,  sp
2:
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1             {v6.8b},       [x9], #8
        ld1r            {v28.2s},      [x9]
        sub             x9,  x9,  #16           // back to buffer row r+2
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v7.8b},       [x9], #8
        ld1             {v28.s}[1],    [x9]
        sub             x9,  x9,  #16           // net advance: four buffer rows
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v3.s}[1], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #64
        ret
endfunc

// ff_put_vp8_epel4_h6v4_neon: 4-wide 6-tap horizontal + 4-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; the output loop counts down by 4 to exactly zero),
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 stores h+3 rows of 4 bytes to a stack buffer (44 bytes are used for
// 11 rows, i.e. h <= 8). In pass 2 the packed result register v1 holds
// output rows 0, 2, 1, 3 in its 32-bit lanes 0..3, hence the store order.
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},  [x5]

        // 44 bytes are needed; 48 are reserved so SP stays 16-byte aligned.
        sub             sp,  sp,  #48
        add             w8,  w4,  #3
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},  [x6]
        mov             x9,  sp
2:
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1r            {v6.2s},       [x9]
        sub             x9,  x9,  #8            // back to buffer row r+2
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v6.s}[1],     [x9]
        sub             x9,  x9,  #8            // net advance: four buffer rows
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[1], [x0], x1
        st1             {v1.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #48
        ret
endfunc

// ff_put_vp8_epel4_h4_neon: 4-wide horizontal 4-tap interpolation.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = h (rows; one per iteration, counted down to exactly zero),
//   w5 = horizontal filter index (zero-extended)
// Only 8 source bytes are loaded per row and only the first four result
// bytes are stored.
function ff_put_vp8_epel4_h4_neon, export=1
        sub             x2,  x2,  #1

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},  [x5]
1:
        ld1             {v2.8b},  [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0], [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub             x2,  x2,  x3

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},  [x6]
1:
        ld1r            {v2.2s},    [x2], x3
        ld1r            {v3.2s},    [x2], x3
        ld1r            {v4.2s},    [x2], x3
1434cabdff1aSopenharmony_ci ld1r {v5.2s}, [x2], x3 1435cabdff1aSopenharmony_ci ld1r {v6.2s}, [x2] 1436cabdff1aSopenharmony_ci sub x2, x2, x3, lsl #1 1437cabdff1aSopenharmony_ci ld1 {v2.s}[1], [x2], x3 1438cabdff1aSopenharmony_ci ld1 {v3.s}[1], [x2], x3 1439cabdff1aSopenharmony_ci ld1 {v4.s}[1], [x2], x3 1440cabdff1aSopenharmony_ci ld1 {v5.s}[1], [x2], x3 1441cabdff1aSopenharmony_ci ld1 {v6.s}[1], [x2] 1442cabdff1aSopenharmony_ci sub x2, x2, x3, lsl #1 1443cabdff1aSopenharmony_ci 1444cabdff1aSopenharmony_ci vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6 1445cabdff1aSopenharmony_ci 1446cabdff1aSopenharmony_ci st1 {v2.s}[0], [x0], x1 1447cabdff1aSopenharmony_ci st1 {v2.s}[2], [x0], x1 1448cabdff1aSopenharmony_ci st1 {v2.s}[1], [x0], x1 1449cabdff1aSopenharmony_ci st1 {v2.s}[3], [x0], x1 1450cabdff1aSopenharmony_ci subs w4, w4, #4 1451cabdff1aSopenharmony_ci b.ne 1b 1452cabdff1aSopenharmony_ci 1453cabdff1aSopenharmony_ci ret 1454cabdff1aSopenharmony_ciendfunc 1455cabdff1aSopenharmony_ci 1456cabdff1aSopenharmony_cifunction ff_put_vp8_epel4_h4v4_neon, export=1 1457cabdff1aSopenharmony_ci sub x2, x2, x3 1458cabdff1aSopenharmony_ci sub x2, x2, #1 1459cabdff1aSopenharmony_ci 1460cabdff1aSopenharmony_ci movrel x7, subpel_filters, -16 1461cabdff1aSopenharmony_ci add x5, x7, w5, uxtw #4 1462cabdff1aSopenharmony_ci ld1 {v0.8h}, [x5] 1463cabdff1aSopenharmony_ci 1464cabdff1aSopenharmony_ci sub sp, sp, #44 1465cabdff1aSopenharmony_ci add w8, w4, #3 1466cabdff1aSopenharmony_ci mov x9, sp 1467cabdff1aSopenharmony_ci1: 1468cabdff1aSopenharmony_ci ld1 {v2.8b}, [x2], x3 1469cabdff1aSopenharmony_ci vp8_epel8_h4 v2, v2, v3 1470cabdff1aSopenharmony_ci st1 {v2.s}[0], [x9], #4 1471cabdff1aSopenharmony_ci subs w8, w8, #1 1472cabdff1aSopenharmony_ci b.ne 1b 1473cabdff1aSopenharmony_ci 1474cabdff1aSopenharmony_ci add x6, x7, w6, uxtw #4 1475cabdff1aSopenharmony_ci ld1 {v0.8h}, [x6] 1476cabdff1aSopenharmony_ci mov x9, sp 1477cabdff1aSopenharmony_ci2: 1478cabdff1aSopenharmony_ci ld1 {v2.8b,v3.8b}, [x9], 
#16 1479cabdff1aSopenharmony_ci ld1r {v6.2s}, [x9] 1480cabdff1aSopenharmony_ci sub x9, x9, #8 1481cabdff1aSopenharmony_ci ld1 {v4.8b,v5.8b}, [x9], #16 1482cabdff1aSopenharmony_ci ld1 {v6.s}[1], [x9] 1483cabdff1aSopenharmony_ci sub x9, x9, #8 1484cabdff1aSopenharmony_ci trn1 v1.2s, v2.2s, v4.2s 1485cabdff1aSopenharmony_ci trn2 v4.2s, v2.2s, v4.2s 1486cabdff1aSopenharmony_ci trn1 v2.2s, v3.2s, v5.2s 1487cabdff1aSopenharmony_ci trn2 v5.2s, v3.2s, v5.2s 1488cabdff1aSopenharmony_ci vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6 1489cabdff1aSopenharmony_ci st1 {v1.s}[0], [x0], x1 1490cabdff1aSopenharmony_ci st1 {v1.s}[2], [x0], x1 1491cabdff1aSopenharmony_ci st1 {v1.s}[1], [x0], x1 1492cabdff1aSopenharmony_ci st1 {v1.s}[3], [x0], x1 1493cabdff1aSopenharmony_ci subs w4, w4, #4 1494cabdff1aSopenharmony_ci b.ne 2b 1495cabdff1aSopenharmony_ci 1496cabdff1aSopenharmony_ci add sp, sp, #44 1497cabdff1aSopenharmony_ci ret 1498cabdff1aSopenharmony_ciendfunc 1499cabdff1aSopenharmony_ci 1500cabdff1aSopenharmony_ci/* Bilinear MC */ 1501cabdff1aSopenharmony_ci 1502cabdff1aSopenharmony_cifunction ff_put_vp8_bilin16_h_neon, export=1 1503cabdff1aSopenharmony_ci mov w7, #8 1504cabdff1aSopenharmony_ci dup v0.8b, w5 1505cabdff1aSopenharmony_ci sub w5, w7, w5 1506cabdff1aSopenharmony_ci dup v1.8b, w5 1507cabdff1aSopenharmony_ci1: 1508cabdff1aSopenharmony_ci subs w4, w4, #2 1509cabdff1aSopenharmony_ci ld1 {v2.8b,v3.8b,v4.8b}, [x2], x3 1510cabdff1aSopenharmony_ci ext v5.8b, v3.8b, v4.8b, #1 1511cabdff1aSopenharmony_ci ext v4.8b, v2.8b, v3.8b, #1 1512cabdff1aSopenharmony_ci umull v16.8h, v2.8b, v1.8b 1513cabdff1aSopenharmony_ci umlal v16.8h, v4.8b, v0.8b 1514cabdff1aSopenharmony_ci ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 1515cabdff1aSopenharmony_ci umull v6.8h, v3.8b, v1.8b 1516cabdff1aSopenharmony_ci umlal v6.8h, v5.8b, v0.8b 1517cabdff1aSopenharmony_ci ext v21.8b, v19.8b, v20.8b, #1 1518cabdff1aSopenharmony_ci ext v20.8b, v18.8b, v19.8b, #1 1519cabdff1aSopenharmony_ci umull v22.8h, v18.8b, v1.8b 
1520cabdff1aSopenharmony_ci umlal v22.8h, v20.8b, v0.8b 1521cabdff1aSopenharmony_ci umull v24.8h, v19.8b, v1.8b 1522cabdff1aSopenharmony_ci umlal v24.8h, v21.8b, v0.8b 1523cabdff1aSopenharmony_ci rshrn v4.8b, v16.8h, #3 1524cabdff1aSopenharmony_ci rshrn2 v4.16b, v6.8h, #3 1525cabdff1aSopenharmony_ci rshrn v6.8b, v22.8h, #3 1526cabdff1aSopenharmony_ci rshrn2 v6.16b, v24.8h, #3 1527cabdff1aSopenharmony_ci st1 {v4.16b}, [x0], x1 1528cabdff1aSopenharmony_ci st1 {v6.16b}, [x0], x1 1529cabdff1aSopenharmony_ci b.gt 1b 1530cabdff1aSopenharmony_ci 1531cabdff1aSopenharmony_ci ret 1532cabdff1aSopenharmony_ciendfunc 1533cabdff1aSopenharmony_ci 1534cabdff1aSopenharmony_cifunction ff_put_vp8_bilin16_v_neon, export=1 1535cabdff1aSopenharmony_ci mov w7, #8 1536cabdff1aSopenharmony_ci dup v0.16b, w6 1537cabdff1aSopenharmony_ci sub w6, w7, w6 1538cabdff1aSopenharmony_ci dup v1.16b, w6 1539cabdff1aSopenharmony_ci 1540cabdff1aSopenharmony_ci ld1 {v2.16b}, [x2], x3 1541cabdff1aSopenharmony_ci1: 1542cabdff1aSopenharmony_ci subs w4, w4, #2 1543cabdff1aSopenharmony_ci ld1 {v4.16b}, [x2], x3 1544cabdff1aSopenharmony_ci umull v6.8h, v2.8b, v1.8b 1545cabdff1aSopenharmony_ci umlal v6.8h, v4.8b, v0.8b 1546cabdff1aSopenharmony_ci umull2 v16.8h, v2.16b, v1.16b 1547cabdff1aSopenharmony_ci umlal2 v16.8h, v4.16b, v0.16b 1548cabdff1aSopenharmony_ci ld1 {v2.16b}, [x2], x3 1549cabdff1aSopenharmony_ci umull v18.8h, v4.8b, v1.8b 1550cabdff1aSopenharmony_ci umlal v18.8h, v2.8b, v0.8b 1551cabdff1aSopenharmony_ci umull2 v20.8h, v4.16b, v1.16b 1552cabdff1aSopenharmony_ci umlal2 v20.8h, v2.16b, v0.16b 1553cabdff1aSopenharmony_ci rshrn v4.8b, v6.8h, #3 1554cabdff1aSopenharmony_ci rshrn2 v4.16b, v16.8h, #3 1555cabdff1aSopenharmony_ci rshrn v6.8b, v18.8h, #3 1556cabdff1aSopenharmony_ci rshrn2 v6.16b, v20.8h, #3 1557cabdff1aSopenharmony_ci st1 {v4.16b}, [x0], x1 1558cabdff1aSopenharmony_ci st1 {v6.16b}, [x0], x1 1559cabdff1aSopenharmony_ci b.gt 1b 1560cabdff1aSopenharmony_ci 1561cabdff1aSopenharmony_ci ret 
1562cabdff1aSopenharmony_ciendfunc 1563cabdff1aSopenharmony_ci 1564cabdff1aSopenharmony_cifunction ff_put_vp8_bilin16_hv_neon, export=1 1565cabdff1aSopenharmony_ci mov w7, #8 1566cabdff1aSopenharmony_ci dup v0.8b, w5 // mx 1567cabdff1aSopenharmony_ci sub w5, w7, w5 1568cabdff1aSopenharmony_ci dup v1.8b, w5 1569cabdff1aSopenharmony_ci dup v2.16b, w6 // my 1570cabdff1aSopenharmony_ci sub w6, w7, w6 1571cabdff1aSopenharmony_ci dup v3.16b, w6 1572cabdff1aSopenharmony_ci 1573cabdff1aSopenharmony_ci ld1 {v4.8b,v5.8b,v6.8b}, [x2], x3 1574cabdff1aSopenharmony_ci 1575cabdff1aSopenharmony_ci ext v7.8b, v5.8b, v6.8b, #1 1576cabdff1aSopenharmony_ci ext v6.8b, v4.8b, v5.8b, #1 1577cabdff1aSopenharmony_ci umull v16.8h, v4.8b, v1.8b 1578cabdff1aSopenharmony_ci umlal v16.8h, v6.8b, v0.8b 1579cabdff1aSopenharmony_ci umull v18.8h, v5.8b, v1.8b 1580cabdff1aSopenharmony_ci umlal v18.8h, v7.8b, v0.8b 1581cabdff1aSopenharmony_ci rshrn v4.8b, v16.8h, #3 1582cabdff1aSopenharmony_ci rshrn2 v4.16b, v18.8h, #3 1583cabdff1aSopenharmony_ci1: 1584cabdff1aSopenharmony_ci subs w4, w4, #2 1585cabdff1aSopenharmony_ci ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 1586cabdff1aSopenharmony_ci ext v21.8b, v19.8b, v20.8b, #1 1587cabdff1aSopenharmony_ci ext v20.8b, v18.8b, v19.8b, #1 1588cabdff1aSopenharmony_ci umull v22.8h, v18.8b, v1.8b 1589cabdff1aSopenharmony_ci umlal v22.8h, v20.8b, v0.8b 1590cabdff1aSopenharmony_ci ld1 {v26.8b,v27.8b,v28.8b}, [x2], x3 1591cabdff1aSopenharmony_ci umull v24.8h, v19.8b, v1.8b 1592cabdff1aSopenharmony_ci umlal v24.8h, v21.8b, v0.8b 1593cabdff1aSopenharmony_ci ext v29.8b, v27.8b, v28.8b, #1 1594cabdff1aSopenharmony_ci ext v28.8b, v26.8b, v27.8b, #1 1595cabdff1aSopenharmony_ci umull v16.8h, v26.8b, v1.8b 1596cabdff1aSopenharmony_ci umlal v16.8h, v28.8b, v0.8b 1597cabdff1aSopenharmony_ci umull v18.8h, v27.8b, v1.8b 1598cabdff1aSopenharmony_ci umlal v18.8h, v29.8b, v0.8b 1599cabdff1aSopenharmony_ci rshrn v6.8b, v22.8h, #3 1600cabdff1aSopenharmony_ci rshrn2 v6.16b, v24.8h, #3 
1601cabdff1aSopenharmony_ci umull v24.8h, v4.8b, v3.8b 1602cabdff1aSopenharmony_ci umlal v24.8h, v6.8b, v2.8b 1603cabdff1aSopenharmony_ci umull2 v30.8h, v4.16b, v3.16b 1604cabdff1aSopenharmony_ci umlal2 v30.8h, v6.16b, v2.16b 1605cabdff1aSopenharmony_ci rshrn v4.8b, v16.8h, #3 1606cabdff1aSopenharmony_ci rshrn2 v4.16b, v18.8h, #3 1607cabdff1aSopenharmony_ci umull v20.8h, v6.8b, v3.8b 1608cabdff1aSopenharmony_ci umlal v20.8h, v4.8b, v2.8b 1609cabdff1aSopenharmony_ci umull2 v22.8h, v6.16b, v3.16b 1610cabdff1aSopenharmony_ci umlal2 v22.8h, v4.16b, v2.16b 1611cabdff1aSopenharmony_ci rshrn v24.8b, v24.8h, #3 1612cabdff1aSopenharmony_ci rshrn2 v24.16b, v30.8h, #3 1613cabdff1aSopenharmony_ci st1 {v24.16b}, [x0], x1 1614cabdff1aSopenharmony_ci rshrn v20.8b, v20.8h, #3 1615cabdff1aSopenharmony_ci rshrn2 v20.16b, v22.8h, #3 1616cabdff1aSopenharmony_ci st1 {v20.16b}, [x0], x1 1617cabdff1aSopenharmony_ci b.gt 1b 1618cabdff1aSopenharmony_ci 1619cabdff1aSopenharmony_ci ret 1620cabdff1aSopenharmony_ciendfunc 1621cabdff1aSopenharmony_ci 1622cabdff1aSopenharmony_cifunction ff_put_vp8_bilin8_h_neon, export=1 1623cabdff1aSopenharmony_ci mov w7, #8 1624cabdff1aSopenharmony_ci dup v0.8b, w5 1625cabdff1aSopenharmony_ci sub w5, w7, w5 1626cabdff1aSopenharmony_ci dup v1.8b, w5 1627cabdff1aSopenharmony_ci1: 1628cabdff1aSopenharmony_ci subs w4, w4, #2 1629cabdff1aSopenharmony_ci ld1 {v2.8b,v3.8b}, [x2], x3 1630cabdff1aSopenharmony_ci ext v3.8b, v2.8b, v3.8b, #1 1631cabdff1aSopenharmony_ci umull v4.8h, v2.8b, v1.8b 1632cabdff1aSopenharmony_ci umlal v4.8h, v3.8b, v0.8b 1633cabdff1aSopenharmony_ci ld1 {v6.8b,v7.8b}, [x2], x3 1634cabdff1aSopenharmony_ci ext v7.8b, v6.8b, v7.8b, #1 1635cabdff1aSopenharmony_ci umull v16.8h, v6.8b, v1.8b 1636cabdff1aSopenharmony_ci umlal v16.8h, v7.8b, v0.8b 1637cabdff1aSopenharmony_ci rshrn v4.8b, v4.8h, #3 1638cabdff1aSopenharmony_ci rshrn v16.8b, v16.8h, #3 1639cabdff1aSopenharmony_ci st1 {v4.8b}, [x0], x1 1640cabdff1aSopenharmony_ci st1 {v16.8b}, [x0], x1 
1641cabdff1aSopenharmony_ci b.gt 1b 1642cabdff1aSopenharmony_ci 1643cabdff1aSopenharmony_ci ret 1644cabdff1aSopenharmony_ciendfunc 1645cabdff1aSopenharmony_ci 1646cabdff1aSopenharmony_cifunction ff_put_vp8_bilin8_v_neon, export=1 1647cabdff1aSopenharmony_ci mov w7, #8 1648cabdff1aSopenharmony_ci dup v0.8b, w6 1649cabdff1aSopenharmony_ci sub w6, w7, w6 1650cabdff1aSopenharmony_ci dup v1.8b, w6 1651cabdff1aSopenharmony_ci 1652cabdff1aSopenharmony_ci ld1 {v2.8b}, [x2], x3 1653cabdff1aSopenharmony_ci1: 1654cabdff1aSopenharmony_ci subs w4, w4, #2 1655cabdff1aSopenharmony_ci ld1 {v3.8b}, [x2], x3 1656cabdff1aSopenharmony_ci umull v4.8h, v2.8b, v1.8b 1657cabdff1aSopenharmony_ci umlal v4.8h, v3.8b, v0.8b 1658cabdff1aSopenharmony_ci ld1 {v2.8b}, [x2], x3 1659cabdff1aSopenharmony_ci umull v6.8h, v3.8b, v1.8b 1660cabdff1aSopenharmony_ci umlal v6.8h, v2.8b, v0.8b 1661cabdff1aSopenharmony_ci rshrn v4.8b, v4.8h, #3 1662cabdff1aSopenharmony_ci rshrn v6.8b, v6.8h, #3 1663cabdff1aSopenharmony_ci st1 {v4.8b}, [x0], x1 1664cabdff1aSopenharmony_ci st1 {v6.8b}, [x0], x1 1665cabdff1aSopenharmony_ci b.gt 1b 1666cabdff1aSopenharmony_ci 1667cabdff1aSopenharmony_ci ret 1668cabdff1aSopenharmony_ciendfunc 1669cabdff1aSopenharmony_ci 1670cabdff1aSopenharmony_cifunction ff_put_vp8_bilin8_hv_neon, export=1 1671cabdff1aSopenharmony_ci mov w7, #8 1672cabdff1aSopenharmony_ci dup v0.8b, w5 // mx 1673cabdff1aSopenharmony_ci sub w5, w7, w5 1674cabdff1aSopenharmony_ci dup v1.8b, w5 1675cabdff1aSopenharmony_ci dup v2.8b, w6 // my 1676cabdff1aSopenharmony_ci sub w6, w7, w6 1677cabdff1aSopenharmony_ci dup v3.8b, w6 1678cabdff1aSopenharmony_ci 1679cabdff1aSopenharmony_ci ld1 {v4.8b,v5.8b}, [x2], x3 1680cabdff1aSopenharmony_ci ext v5.8b, v4.8b, v5.8b, #1 1681cabdff1aSopenharmony_ci umull v18.8h, v4.8b, v1.8b 1682cabdff1aSopenharmony_ci umlal v18.8h, v5.8b, v0.8b 1683cabdff1aSopenharmony_ci rshrn v22.8b, v18.8h, #3 1684cabdff1aSopenharmony_ci1: 1685cabdff1aSopenharmony_ci subs w4, w4, #2 
1686cabdff1aSopenharmony_ci ld1 {v6.8b,v7.8b}, [x2], x3 1687cabdff1aSopenharmony_ci ext v7.8b, v6.8b, v7.8b, #1 1688cabdff1aSopenharmony_ci umull v16.8h, v6.8b, v1.8b 1689cabdff1aSopenharmony_ci umlal v16.8h, v7.8b, v0.8b 1690cabdff1aSopenharmony_ci ld1 {v4.8b,v5.8b}, [x2], x3 1691cabdff1aSopenharmony_ci ext v5.8b, v4.8b, v5.8b, #1 1692cabdff1aSopenharmony_ci umull v18.8h, v4.8b, v1.8b 1693cabdff1aSopenharmony_ci umlal v18.8h, v5.8b, v0.8b 1694cabdff1aSopenharmony_ci rshrn v16.8b, v16.8h, #3 1695cabdff1aSopenharmony_ci umull v20.8h, v22.8b, v3.8b 1696cabdff1aSopenharmony_ci umlal v20.8h, v16.8b, v2.8b 1697cabdff1aSopenharmony_ci rshrn v22.8b, v18.8h, #3 1698cabdff1aSopenharmony_ci umull v24.8h, v16.8b, v3.8b 1699cabdff1aSopenharmony_ci umlal v24.8h, v22.8b, v2.8b 1700cabdff1aSopenharmony_ci rshrn v20.8b, v20.8h, #3 1701cabdff1aSopenharmony_ci st1 {v20.8b}, [x0], x1 1702cabdff1aSopenharmony_ci rshrn v23.8b, v24.8h, #3 1703cabdff1aSopenharmony_ci st1 {v23.8b}, [x0], x1 1704cabdff1aSopenharmony_ci b.gt 1b 1705cabdff1aSopenharmony_ci 1706cabdff1aSopenharmony_ci ret 1707cabdff1aSopenharmony_ciendfunc 1708cabdff1aSopenharmony_ci 1709cabdff1aSopenharmony_cifunction ff_put_vp8_bilin4_h_neon, export=1 1710cabdff1aSopenharmony_ci mov w7, #8 1711cabdff1aSopenharmony_ci dup v0.8b, w5 1712cabdff1aSopenharmony_ci sub w5, w7, w5 1713cabdff1aSopenharmony_ci dup v1.8b, w5 1714cabdff1aSopenharmony_ci1: 1715cabdff1aSopenharmony_ci subs w4, w4, #2 1716cabdff1aSopenharmony_ci ld1 {v2.8b}, [x2], x3 1717cabdff1aSopenharmony_ci ext v3.8b, v2.8b, v3.8b, #1 1718cabdff1aSopenharmony_ci ld1 {v6.8b}, [x2], x3 1719cabdff1aSopenharmony_ci ext v7.8b, v6.8b, v7.8b, #1 1720cabdff1aSopenharmony_ci trn1 v2.2s, v2.2s, v6.2s 1721cabdff1aSopenharmony_ci trn1 v3.2s, v3.2s, v7.2s 1722cabdff1aSopenharmony_ci umull v4.8h, v2.8b, v1.8b 1723cabdff1aSopenharmony_ci umlal v4.8h, v3.8b, v0.8b 1724cabdff1aSopenharmony_ci rshrn v4.8b, v4.8h, #3 1725cabdff1aSopenharmony_ci st1 {v4.s}[0], [x0], x1 
1726cabdff1aSopenharmony_ci st1 {v4.s}[1], [x0], x1 1727cabdff1aSopenharmony_ci b.gt 1b 1728cabdff1aSopenharmony_ci 1729cabdff1aSopenharmony_ci ret 1730cabdff1aSopenharmony_ciendfunc 1731cabdff1aSopenharmony_ci 1732cabdff1aSopenharmony_cifunction ff_put_vp8_bilin4_v_neon, export=1 1733cabdff1aSopenharmony_ci mov w7, #8 1734cabdff1aSopenharmony_ci dup v0.8b, w6 1735cabdff1aSopenharmony_ci sub w6, w7, w6 1736cabdff1aSopenharmony_ci dup v1.8b, w6 1737cabdff1aSopenharmony_ci 1738cabdff1aSopenharmony_ci ld1r {v2.2s}, [x2], x3 1739cabdff1aSopenharmony_ci1: 1740cabdff1aSopenharmony_ci ld1r {v3.2s}, [x2] 1741cabdff1aSopenharmony_ci ld1 {v2.s}[1], [x2], x3 1742cabdff1aSopenharmony_ci ld1 {v3.s}[1], [x2], x3 1743cabdff1aSopenharmony_ci umull v4.8h, v2.8b, v1.8b 1744cabdff1aSopenharmony_ci umlal v4.8h, v3.8b, v0.8b 1745cabdff1aSopenharmony_ci trn2 v2.2s, v3.2s, v2.2s 1746cabdff1aSopenharmony_ci rshrn v4.8b, v4.8h, #3 1747cabdff1aSopenharmony_ci st1 {v4.s}[0], [x0], x1 1748cabdff1aSopenharmony_ci st1 {v4.s}[1], [x0], x1 1749cabdff1aSopenharmony_ci subs w4, w4, #2 1750cabdff1aSopenharmony_ci b.gt 1b 1751cabdff1aSopenharmony_ci 1752cabdff1aSopenharmony_ci ret 1753cabdff1aSopenharmony_ciendfunc 1754cabdff1aSopenharmony_ci 1755cabdff1aSopenharmony_cifunction ff_put_vp8_bilin4_hv_neon, export=1 1756cabdff1aSopenharmony_ci mov w7, #8 1757cabdff1aSopenharmony_ci dup v0.8b, w5 // mx 1758cabdff1aSopenharmony_ci sub w5, w7, w5 1759cabdff1aSopenharmony_ci dup v1.8b, w5 1760cabdff1aSopenharmony_ci dup v2.8b, w6 // my 1761cabdff1aSopenharmony_ci sub w6, w7, w6 1762cabdff1aSopenharmony_ci dup v3.8b, w6 1763cabdff1aSopenharmony_ci 1764cabdff1aSopenharmony_ci ld1 {v4.8b}, [x2], x3 1765cabdff1aSopenharmony_ci ext v5.8b, v4.8b, v4.8b, #1 1766cabdff1aSopenharmony_ci umull v18.8h, v4.8b, v1.8b 1767cabdff1aSopenharmony_ci umlal v18.8h, v5.8b, v0.8b 1768cabdff1aSopenharmony_ci rshrn v22.8b, v18.8h, #3 1769cabdff1aSopenharmony_ci1: 1770cabdff1aSopenharmony_ci subs w4, w4, #2 
1771cabdff1aSopenharmony_ci ld1 {v6.8b}, [x2], x3 1772cabdff1aSopenharmony_ci ext v7.8b, v6.8b, v6.8b, #1 1773cabdff1aSopenharmony_ci ld1 {v4.8b}, [x2], x3 1774cabdff1aSopenharmony_ci ext v5.8b, v4.8b, v4.8b, #1 1775cabdff1aSopenharmony_ci trn1 v6.2s, v6.2s, v4.2s 1776cabdff1aSopenharmony_ci trn1 v7.2s, v7.2s, v5.2s 1777cabdff1aSopenharmony_ci umull v16.8h, v6.8b, v1.8b 1778cabdff1aSopenharmony_ci umlal v16.8h, v7.8b, v0.8b 1779cabdff1aSopenharmony_ci rshrn v16.8b, v16.8h, #3 1780cabdff1aSopenharmony_ci umull v20.8h, v16.8b, v2.8b 1781cabdff1aSopenharmony_ci trn1 v22.2s, v22.2s, v16.2s 1782cabdff1aSopenharmony_ci umlal v20.8h, v22.8b, v3.8b 1783cabdff1aSopenharmony_ci rev64 v22.2s, v16.2s 1784cabdff1aSopenharmony_ci rshrn v20.8b, v20.8h, #3 1785cabdff1aSopenharmony_ci st1 {v20.s}[0], [x0], x1 1786cabdff1aSopenharmony_ci st1 {v20.s}[1], [x0], x1 1787cabdff1aSopenharmony_ci b.gt 1b 1788cabdff1aSopenharmony_ci 1789cabdff1aSopenharmony_ci ret 1790cabdff1aSopenharmony_ciendfunc 1791