/*
 * VC1 AArch64 NEON optimisations
 *
 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// VC-1 8x8 inverse transform
// On entry:
//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
// On exit:
//   array at x0 updated to hold transformed block; also now held in row-major order
// Registers written: x0, x1, v0-v7, v16-v24 (v8-v15 are never touched).
// The multipliers are loaded from .Lcoeffs_it8, which is defined elsewhere in
// this file; going by the per-instruction comments, h[0] = 6/2, h[1] = 9 and
// h[2] = 15.
function ff_vc1_inv_trans_8x8_neon, export=1
        // Load the 64 coefficients.  x0 is post-incremented by the first three
        // loads, so x1 = x0 - 3*32 recovers the start of the array for the
        // stores at the end.
        ld1             {v1.16b, v2.16b}, [x0], #32
        ld1             {v3.16b, v4.16b}, [x0], #32
        ld1             {v5.16b, v6.16b}, [x0], #32
        shl             v1.8h, v1.8h, #2            // 8/2 * src[0]
        sub             x1, x0, #3*32
        ld1             {v16.16b, v17.16b}, [x0]
        // First 1-D pass.  Each tN/2 value is materialised twice further down
        // because ssra accumulates in place: one copy feeds the "+" output and
        // the other feeds the "-" output.
        shl             v7.8h, v2.8h, #4            // 16 * src[8]
        shl             v18.8h, v2.8h, #2           // 4 * src[8]
        shl             v19.8h, v4.8h, #4           // 16 * src[24]
        ldr             d0, .Lcoeffs_it8
        shl             v5.8h, v5.8h, #2            // 8/2 * src[32]
        shl             v20.8h, v6.8h, #4           // 16 * src[40]
        shl             v21.8h, v6.8h, #2           // 4 * src[40]
        shl             v22.8h, v17.8h, #4          // 16 * src[56]
        ssra            v20.8h, v19.8h, #2          // 4 * src[24] + 16 * src[40]
        mul             v23.8h, v3.8h, v0.h[0]      // 6/2 * src[16]
        sub             v19.8h, v19.8h, v21.8h      // 16 * src[24] - 4 * src[40]
        ssra            v7.8h, v22.8h, #2           // 16 * src[8] + 4 * src[56]
        sub             v18.8h, v22.8h, v18.8h      // - 4 * src[8] + 16 * src[56]
        shl             v3.8h, v3.8h, #3            // 16/2 * src[16]
        mls             v20.8h, v2.8h, v0.h[2]      // - 15 * src[8] + 4 * src[24] + 16 * src[40]
        ssra            v1.8h, v1.8h, #1            // 12/2 * src[0]
        ssra            v5.8h, v5.8h, #1            // 12/2 * src[32]
        mla             v7.8h, v4.8h, v0.h[2]       // 16 * src[8] + 15 * src[24] + 4 * src[56]
        shl             v21.8h, v16.8h, #3          // 16/2 * src[48]
        mls             v19.8h, v2.8h, v0.h[1]      // - 9 * src[8] + 16 * src[24] - 4 * src[40]
        sub             v2.8h, v23.8h, v21.8h       // t4/2 = 6/2 * src[16] - 16/2 * src[48]
        mla             v18.8h, v4.8h, v0.h[1]      // - 4 * src[8] + 9 * src[24] + 16 * src[56]
        add             v4.8h, v1.8h, v5.8h         // t1/2 = 12/2 * src[0] + 12/2 * src[32]
        sub             v1.8h, v1.8h, v5.8h         // t2/2 = 12/2 * src[0] - 12/2 * src[32]
        mla             v3.8h, v16.8h, v0.h[0]      // t3/2 = 16/2 * src[16] + 6/2 * src[48]
        mla             v7.8h, v6.8h, v0.h[1]       // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
        add             v5.8h, v1.8h, v2.8h         // t6/2 = t2/2 + t4/2
        sub             v16.8h, v1.8h, v2.8h        // t7/2 = t2/2 - t4/2
        mla             v20.8h, v17.8h, v0.h[1]     // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
        add             v21.8h, v1.8h, v2.8h        // t6/2 = t2/2 + t4/2
        add             v22.8h, v4.8h, v3.8h        // t5/2 = t1/2 + t3/2
        mls             v19.8h, v17.8h, v0.h[2]     // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
        sub             v17.8h, v4.8h, v3.8h        // t8/2 = t1/2 - t3/2
        add             v23.8h, v4.8h, v3.8h        // t5/2 = t1/2 + t3/2
        mls             v18.8h, v6.8h, v0.h[2]      // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v1.8h, v1.8h, v2.8h         // t7/2 = t2/2 - t4/2
        sub             v2.8h, v4.8h, v3.8h         // t8/2 = t1/2 - t3/2
        neg             v3.8h, v7.8h                // -t1
        neg             v4.8h, v20.8h               // +t2
        neg             v6.8h, v19.8h               // +t3
        ssra            v22.8h, v7.8h, #1           // (t5 + t1) >> 1
        ssra            v1.8h, v19.8h, #1           // (t7 - t3) >> 1
        neg             v7.8h, v18.8h               // +t4
        ssra            v5.8h, v4.8h, #1            // (t6 + t2) >> 1
        ssra            v16.8h, v6.8h, #1           // (t7 + t3) >> 1
        ssra            v2.8h, v18.8h, #1           // (t8 - t4) >> 1
        ssra            v17.8h, v7.8h, #1           // (t8 + t4) >> 1
        ssra            v21.8h, v20.8h, #1          // (t6 - t2) >> 1
        ssra            v23.8h, v3.8h, #1           // (t5 - t1) >> 1
        srshr           v3.8h, v22.8h, #2           // (t5 + t1 + 4) >> 3
        srshr           v4.8h, v5.8h, #2            // (t6 + t2 + 4) >> 3
        srshr           v5.8h, v16.8h, #2           // (t7 + t3 + 4) >> 3
        srshr           v6.8h, v17.8h, #2           // (t8 + t4 + 4) >> 3
        srshr           v2.8h, v2.8h, #2            // (t8 - t4 + 4) >> 3
        srshr           v1.8h, v1.8h, #2            // (t7 - t3 + 4) >> 3
        srshr           v7.8h, v21.8h, #2           // (t6 - t2 + 4) >> 3
        srshr           v16.8h, v23.8h, #2          // (t5 - t1 + 4) >> 3
        // Rearrange the first-pass results with a trn1/trn2 network at 16-,
        // 32- and 64-bit granularity.  The tail of the network is interleaved
        // with the first instructions of the second pass.
        trn2            v17.8h, v3.8h, v4.8h
        trn2            v18.8h, v5.8h, v6.8h
        trn2            v19.8h, v2.8h, v1.8h
        trn2            v20.8h, v7.8h, v16.8h
        trn1            v21.4s, v17.4s, v18.4s
        trn2            v17.4s, v17.4s, v18.4s
        trn1            v18.4s, v19.4s, v20.4s
        trn2            v19.4s, v19.4s, v20.4s
        trn1            v3.8h, v3.8h, v4.8h
        trn2            v4.2d, v21.2d, v18.2d
        trn1            v20.2d, v17.2d, v19.2d
        trn1            v5.8h, v5.8h, v6.8h
        trn1            v1.8h, v2.8h, v1.8h
        trn1            v2.8h, v7.8h, v16.8h
        trn1            v6.2d, v21.2d, v18.2d
        trn2            v7.2d, v17.2d, v19.2d
        // Second 1-D pass.
        shl             v16.8h, v20.8h, #4          // 16 * src[24]
        shl             v17.8h, v4.8h, #4           // 16 * src[40]
        trn1            v18.4s, v3.4s, v5.4s
        trn1            v19.4s, v1.4s, v2.4s
        shl             v21.8h, v7.8h, #4           // 16 * src[56]
        shl             v22.8h, v6.8h, #2           // 4 * src[8]
        shl             v23.8h, v4.8h, #2           // 4 * src[40]
        trn2            v3.4s, v3.4s, v5.4s
        trn2            v1.4s, v1.4s, v2.4s
        shl             v2.8h, v6.8h, #4            // 16 * src[8]
        sub             v5.8h, v16.8h, v23.8h       // 16 * src[24] - 4 * src[40]
        ssra            v17.8h, v16.8h, #2          // 4 * src[24] + 16 * src[40]
        sub             v16.8h, v21.8h, v22.8h      // - 4 * src[8] + 16 * src[56]
        trn1            v22.2d, v18.2d, v19.2d
        trn2            v18.2d, v18.2d, v19.2d
        trn1            v19.2d, v3.2d, v1.2d
        ssra            v2.8h, v21.8h, #2           // 16 * src[8] + 4 * src[56]
        mls             v17.8h, v6.8h, v0.h[2]      // - 15 * src[8] + 4 * src[24] + 16 * src[40]
        shl             v21.8h, v22.8h, #2          // 8/2 * src[0]
        shl             v18.8h, v18.8h, #2          // 8/2 * src[32]
        mls             v5.8h, v6.8h, v0.h[1]       // - 9 * src[8] + 16 * src[24] - 4 * src[40]
        shl             v6.8h, v19.8h, #3           // 16/2 * src[16]
        trn2            v1.2d, v3.2d, v1.2d
        mla             v16.8h, v20.8h, v0.h[1]     // - 4 * src[8] + 9 * src[24] + 16 * src[56]
        ssra            v21.8h, v21.8h, #1          // 12/2 * src[0]
        ssra            v18.8h, v18.8h, #1          // 12/2 * src[32]
        mul             v3.8h, v19.8h, v0.h[0]      // 6/2 * src[16]
        shl             v19.8h, v1.8h, #3           // 16/2 * src[48]
        mla             v2.8h, v20.8h, v0.h[2]      // 16 * src[8] + 15 * src[24] + 4 * src[56]
        add             v20.8h, v21.8h, v18.8h      // t1/2 = 12/2 * src[0] + 12/2 * src[32]
        mla             v6.8h, v1.8h, v0.h[0]       // t3/2 = 16/2 * src[16] + 6/2 * src[48]
        sub             v1.8h, v21.8h, v18.8h       // t2/2 = 12/2 * src[0] - 12/2 * src[32]
        sub             v3.8h, v3.8h, v19.8h        // t4/2 = 6/2 * src[16] - 16/2 * src[48]
        mla             v17.8h, v7.8h, v0.h[1]      // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
        mls             v5.8h, v7.8h, v0.h[2]       // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
        add             v7.8h, v1.8h, v3.8h         // t6/2 = t2/2 + t4/2
        add             v18.8h, v20.8h, v6.8h       // t5/2 = t1/2 + t3/2
        mls             v16.8h, v4.8h, v0.h[2]      // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v19.8h, v1.8h, v3.8h        // t7/2 = t2/2 - t4/2
        neg             v21.8h, v17.8h              // +t2
        mla             v2.8h, v4.8h, v0.h[1]       // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
        // That was the last read of the coefficient vector, so v0 is free to
        // be reused as a work register from here on.
        sub             v0.8h, v20.8h, v6.8h        // t8/2 = t1/2 - t3/2
        neg             v4.8h, v5.8h                // +t3
        sub             v22.8h, v1.8h, v3.8h        // t7/2 = t2/2 - t4/2
        sub             v23.8h, v20.8h, v6.8h       // t8/2 = t1/2 - t3/2
        neg             v24.8h, v16.8h              // +t4
        add             v6.8h, v20.8h, v6.8h        // t5/2 = t1/2 + t3/2
        add             v1.8h, v1.8h, v3.8h         // t6/2 = t2/2 + t4/2
        ssra            v7.8h, v21.8h, #1           // (t6 + t2) >> 1
        neg             v3.8h, v2.8h                // -t1
        ssra            v18.8h, v2.8h, #1           // (t5 + t1) >> 1
        ssra            v19.8h, v4.8h, #1           // (t7 + t3) >> 1
        ssra            v0.8h, v24.8h, #1           // (t8 + t4) >> 1
        // The four "-" outputs use the rounding form srsra, which supplies the
        // extra +1 bias.  They are the registers written by the last two st1
        // instructions below.
        srsra           v23.8h, v16.8h, #1          // (t8 - t4 + 1) >> 1
        srsra           v22.8h, v5.8h, #1           // (t7 - t3 + 1) >> 1
        srsra           v1.8h, v17.8h, #1           // (t6 - t2 + 1) >> 1
        srsra           v6.8h, v3.8h, #1            // (t5 - t1 + 1) >> 1
        srshr           v2.8h, v18.8h, #6           // (t5 + t1 + 64) >> 7
        srshr           v3.8h, v7.8h, #6            // (t6 + t2 + 64) >> 7
        srshr           v4.8h, v19.8h, #6           // (t7 + t3 + 64) >> 7
        srshr           v5.8h, v0.8h, #6            // (t8 + t4 + 64) >> 7
        srshr           v16.8h, v23.8h, #6          // (t8 - t4 + 65) >> 7
        srshr           v17.8h, v22.8h, #6          // (t7 - t3 + 65) >> 7
        st1             {v2.16b, v3.16b}, [x1], #32
        srshr           v0.8h, v1.8h, #6            // (t6 - t2 + 65) >> 7
        srshr           v1.8h, v6.8h, #6            // (t5 - t1 + 65) >> 7
        st1             {v4.16b, v5.16b}, [x1], #32
        st1             {v16.16b, v17.16b}, [x1], #32
        st1             {v0.16b, v1.16b}, [x1]
        ret
endfunc

// VC-1 8x4 inverse transform
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> array of 16-bit inverse transform coefficients, in row-major order
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_8x4_neon, export=1
        // Registers written: x0, x2, x3, v0-v7, v16-v28 (v8-v15 are never
        // touched).  x3 keeps the original destination pointer for the final
        // stores, because x0 is advanced while the four destination rows are
        // loaded.
        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
        mov             x3, x0
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
        ldr             q0, .Lcoeffs_it8            // includes 4-point coefficients in upper half of vector
        ld1             {v5.8b}, [x0], x1
        // Rearrange the coefficients with trn1/trn2 at 16- and 32-bit
        // granularity.  The destination-row loads are interleaved with this
        // and with the start of the 8-point pass.
        trn2            v6.4h, v1.4h, v3.4h
        trn2            v7.4h, v2.4h, v4.4h
        trn1            v1.4h, v1.4h, v3.4h
        trn1            v2.4h, v2.4h, v4.4h
        trn2            v3.4h, v16.4h, v18.4h
        trn2            v4.4h, v17.4h, v19.4h
        trn1            v16.4h, v16.4h, v18.4h
        trn1            v17.4h, v17.4h, v19.4h
        ld1             {v18.8b}, [x0], x1
        trn1            v19.2s, v6.2s, v3.2s
        trn2            v3.2s, v6.2s, v3.2s
        trn1            v6.2s, v7.2s, v4.2s
        trn2            v4.2s, v7.2s, v4.2s
        trn1            v7.2s, v1.2s, v16.2s
        trn1            v20.2s, v2.2s, v17.2s
        // 8-point pass, using lanes h[0]..h[2] of v0.
        shl             v21.4h, v19.4h, #4          // 16 * src[1]
        trn2            v1.2s, v1.2s, v16.2s
        shl             v16.4h, v3.4h, #4           // 16 * src[3]
        trn2            v2.2s, v2.2s, v17.2s
        shl             v17.4h, v6.4h, #4           // 16 * src[5]
        ld1             {v22.8b}, [x0], x1
        shl             v23.4h, v4.4h, #4           // 16 * src[7]
        mul             v24.4h, v1.4h, v0.h[0]      // 6/2 * src[2]
        ld1             {v25.8b}, [x0]
        shl             v26.4h, v19.4h, #2          // 4 * src[1]
        shl             v27.4h, v6.4h, #2           // 4 * src[5]
        ssra            v21.4h, v23.4h, #2          // 16 * src[1] + 4 * src[7]
        ssra            v17.4h, v16.4h, #2          // 4 * src[3] + 16 * src[5]
        sub             v23.4h, v23.4h, v26.4h      // - 4 * src[1] + 16 * src[7]
        sub             v16.4h, v16.4h, v27.4h      // 16 * src[3] - 4 * src[5]
        shl             v7.4h, v7.4h, #2            // 8/2 * src[0]
        shl             v20.4h, v20.4h, #2          // 8/2 * src[4]
        mla             v21.4h, v3.4h, v0.h[2]      // 16 * src[1] + 15 * src[3] + 4 * src[7]
        shl             v1.4h, v1.4h, #3            // 16/2 * src[2]
        mls             v17.4h, v19.4h, v0.h[2]     // - 15 * src[1] + 4 * src[3] + 16 * src[5]
        ssra            v7.4h, v7.4h, #1            // 12/2 * src[0]
        mls             v16.4h, v19.4h, v0.h[1]     // - 9 * src[1] + 16 * src[3] - 4 * src[5]
        ssra            v20.4h, v20.4h, #1          // 12/2 * src[4]
        mla             v23.4h, v3.4h, v0.h[1]      // - 4 * src[1] + 9 * src[3] + 16 * src[7]
        shl             v3.4h, v2.4h, #3            // 16/2 * src[6]
        mla             v1.4h, v2.4h, v0.h[0]       // t3/2 = 16/2 * src[2] + 6/2 * src[6]
        mla             v21.4h, v6.4h, v0.h[1]      // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7]
        mla             v17.4h, v4.4h, v0.h[1]      // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7]
        sub             v2.4h, v24.4h, v3.4h        // t4/2 = 6/2 * src[2] - 16/2 * src[6]
        mls             v16.4h, v4.4h, v0.h[2]      // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7]
        add             v3.4h, v7.4h, v20.4h        // t1/2 = 12/2 * src[0] + 12/2 * src[4]
        mls             v23.4h, v6.4h, v0.h[2]      // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7]
        sub             v4.4h, v7.4h, v20.4h        // t2/2 = 12/2 * src[0] - 12/2 * src[4]
        neg             v6.4h, v21.4h               // -t1
        // Each tN/2 is computed twice because ssra accumulates in place: one
        // copy feeds the "+" output and the other feeds the "-" output.
        add             v7.4h, v3.4h, v1.4h         // t5/2 = t1/2 + t3/2
        sub             v19.4h, v3.4h, v1.4h        // t8/2 = t1/2 - t3/2
        add             v20.4h, v4.4h, v2.4h        // t6/2 = t2/2 + t4/2
        sub             v24.4h, v4.4h, v2.4h        // t7/2 = t2/2 - t4/2
        add             v26.4h, v3.4h, v1.4h        // t5/2 = t1/2 + t3/2
        add             v27.4h, v4.4h, v2.4h        // t6/2 = t2/2 + t4/2
        sub             v2.4h, v4.4h, v2.4h         // t7/2 = t2/2 - t4/2
        sub             v1.4h, v3.4h, v1.4h         // t8/2 = t1/2 - t3/2
        neg             v3.4h, v17.4h               // +t2
        neg             v4.4h, v16.4h               // +t3
        neg             v28.4h, v23.4h              // +t4
        ssra            v7.4h, v21.4h, #1           // (t5 + t1) >> 1
        ssra            v1.4h, v23.4h, #1           // (t8 - t4) >> 1
        ssra            v20.4h, v3.4h, #1           // (t6 + t2) >> 1
        ssra            v24.4h, v4.4h, #1           // (t7 + t3) >> 1
        ssra            v19.4h, v28.4h, #1          // (t8 + t4) >> 1
        ssra            v2.4h, v16.4h, #1           // (t7 - t3) >> 1
        ssra            v27.4h, v17.4h, #1          // (t6 - t2) >> 1
        ssra            v26.4h, v6.4h, #1           // (t5 - t1) >> 1
        // Pair the 64-bit halves so the rounding shifts and the 4-point pass
        // work on full 128-bit vectors.
        trn1            v1.2d, v7.2d, v1.2d
        trn1            v2.2d, v20.2d, v2.2d
        trn1            v3.2d, v24.2d, v27.2d
        trn1            v4.2d, v19.2d, v26.2d
        srshr           v1.8h, v1.8h, #2            // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
        srshr           v2.8h, v2.8h, #2            // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
        srshr           v3.8h, v3.8h, #2            // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
        srshr           v4.8h, v4.8h, #2            // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
        trn2            v6.8h, v1.8h, v2.8h
        trn1            v1.8h, v1.8h, v2.8h
        trn2            v2.8h, v3.8h, v4.8h
        trn1            v3.8h, v3.8h, v4.8h
        trn2            v4.4s, v6.4s, v2.4s
        trn1            v7.4s, v1.4s, v3.4s
        trn2            v1.4s, v1.4s, v3.4s
        // 4-point pass, using lanes h[4]..h[6] of v0.
        mul             v3.8h, v4.8h, v0.h[5]       // 22/2 * src[24]
        trn1            v2.4s, v6.4s, v2.4s
        mul             v4.8h, v4.8h, v0.h[4]       // 10/2 * src[24]
        mul             v6.8h, v7.8h, v0.h[6]       // 17 * src[0]
        mul             v1.8h, v1.8h, v0.h[6]       // 17 * src[16]
        mls             v3.8h, v2.8h, v0.h[4]       // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
        mla             v4.8h, v2.8h, v0.h[5]       // t3/2 = 22/2 * src[8] + 10/2 * src[24]
        // That was the last read of the coefficient vector, so v0 is reused.
        add             v0.8h, v6.8h, v1.8h         // t1 = 17 * src[0] + 17 * src[16]
        sub             v1.8h, v6.8h, v1.8h         // t2 = 17 * src[0] - 17 * src[16]
        neg             v2.8h, v3.8h                // -t4/2
        neg             v6.8h, v4.8h                // -t3/2
        ssra            v4.8h, v0.8h, #1            // (t1 + t3) >> 1
        ssra            v2.8h, v1.8h, #1            // (t2 - t4) >> 1
        ssra            v3.8h, v1.8h, #1            // (t2 + t4) >> 1
        ssra            v6.8h, v0.8h, #1            // (t1 - t3) >> 1
        srshr           v0.8h, v4.8h, #6            // (t1 + t3 + 64) >> 7
        srshr           v1.8h, v2.8h, #6            // (t2 - t4 + 64) >> 7
        srshr           v2.8h, v3.8h, #6            // (t2 + t4 + 64) >> 7
        srshr           v3.8h, v6.8h, #6            // (t1 - t3 + 64) >> 7
        // Widen-add the unsigned destination samples, then narrow with
        // unsigned saturation back to 8 bits.
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v1.8h, v1.8h, v18.8b
        uaddw           v2.8h, v2.8h, v22.8b
        uaddw           v3.8h, v3.8h, v25.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        st1             {v3.8b}, [x3]
        ret
endfunc

// VC-1 4x8 inverse transform
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
// Registers written: x0, x2, x3, x4, v0-v7, v16-v27 (v8-v15 are never touched).
function ff_vc1_inv_trans_4x8_neon, export=1
        mov             x3, #16                     // coefficient row stride in bytes (8 coefficients * 2)
        ldr             q0, .Lcoeffs_it8            // includes 4-point coefficients in upper half of vector
        mov             x4, x0                      // keep the destination start for the stores; x0 is advanced by the loads
        ld1             {v1.d}[0], [x2], x3         // 00 01 02 03
        ld1             {v2.d}[0], [x2], x3         // 10 11 12 13
        ld1             {v3.d}[0], [x2], x3         // 20 21 22 23
        ld1             {v4.d}[0], [x2], x3         // 30 31 32 33
        ld1             {v1.d}[1], [x2], x3         // 40 41 42 43
        ld1             {v2.d}[1], [x2], x3         // 50 51 52 53
        ld1             {v3.d}[1], [x2], x3         // 60 61 62 63
        ld1             {v4.d}[1], [x2]             // 70 71 72 73
        // Destination rows r and r+4 share a register (lane s[0] / s[1]),
        // matching the way the outputs are paired before the final add.
        ld1             {v5.s}[0], [x0], x1
        ld1             {v6.s}[0], [x0], x1
        ld1             {v7.s}[0], [x0], x1
        trn2            v16.8h, v1.8h, v2.8h        // 01 11 03 13 41 51 43 53
        trn1            v1.8h, v1.8h, v2.8h         // 00 10 02 12 40 50 42 52
        trn2            v2.8h, v3.8h, v4.8h         // 21 31 23 33 61 71 63 73
        trn1            v3.8h, v3.8h, v4.8h         // 20 30 22 32 60 70 62 72
        ld1             {v4.s}[0], [x0], x1
        trn2            v17.4s, v16.4s, v2.4s       // 03 13 23 33 43 53 63 73
        trn1            v18.4s, v1.4s, v3.4s        // 00 10 20 30 40 50 60 70
        trn1            v2.4s, v16.4s, v2.4s        // 01 11 21 31 41 51 61 71
        // 4-point pass, using lanes h[4]..h[6] of v0.
        mul             v16.8h, v17.8h, v0.h[4]     // 10/2 * src[3]
        ld1             {v5.s}[1], [x0], x1
        mul             v17.8h, v17.8h, v0.h[5]     // 22/2 * src[3]
        ld1             {v6.s}[1], [x0], x1
        trn2            v1.4s, v1.4s, v3.4s         // 02 12 22 32 42 52 62 72
        mul             v3.8h, v18.8h, v0.h[6]      // 17 * src[0]
        ld1             {v7.s}[1], [x0], x1
        mul             v1.8h, v1.8h, v0.h[6]       // 17 * src[2]
        ld1             {v4.s}[1], [x0]
        mla             v16.8h, v2.8h, v0.h[5]      // t3/2 = 22/2 * src[1] + 10/2 * src[3]
        mls             v17.8h, v2.8h, v0.h[4]      // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
        add             v2.8h, v3.8h, v1.8h         // t1 = 17 * src[0] + 17 * src[2]
        sub             v1.8h, v3.8h, v1.8h         // t2 = 17 * src[0] - 17 * src[2]
        neg             v3.8h, v16.8h               // -t3/2
        ssra            v16.8h, v2.8h, #1           // (t1 + t3) >> 1
        neg             v18.8h, v17.8h              // -t4/2
        ssra            v17.8h, v1.8h, #1           // (t2 + t4) >> 1
        ssra            v3.8h, v2.8h, #1            // (t1 - t3) >> 1
        ssra            v18.8h, v1.8h, #1           // (t2 - t4) >> 1
        srshr           v1.8h, v16.8h, #2           // (t1 + t3 + 4) >> 3
        srshr           v2.8h, v17.8h, #2           // (t2 + t4 + 4) >> 3
        srshr           v3.8h, v3.8h, #2            // (t1 - t3 + 4) >> 3
        srshr           v16.8h, v18.8h, #2          // (t2 - t4 + 4) >> 3
        trn2            v17.8h, v2.8h, v3.8h        // 12 13 32 33 52 53 72 73
        trn2            v18.8h, v1.8h, v16.8h       // 10 11 30 31 50 51 70 71
        trn1            v1.8h, v1.8h, v16.8h        // 00 01 20 21 40 41 60 61
        trn1            v2.8h, v2.8h, v3.8h         // 02 03 22 23 42 43 62 63
        trn1            v3.4s, v18.4s, v17.4s       // 10 11 12 13 50 51 52 53
        trn2            v16.4s, v18.4s, v17.4s      // 30 31 32 33 70 71 72 73
        trn1            v17.4s, v1.4s, v2.4s        // 00 01 02 03 40 41 42 43
        // 8-point pass on 4-lane vectors, using lanes h[0]..h[2] of v0.  The
        // upper halves are split out into their own registers first.
        mov             d18, v3.d[1]                // 50 51 52 53
        shl             v19.4h, v3.4h, #4           // 16 * src[8]
        mov             d20, v16.d[1]               // 70 71 72 73
        shl             v21.4h, v16.4h, #4          // 16 * src[24]
        mov             d22, v17.d[1]               // 40 41 42 43
        shl             v23.4h, v3.4h, #2           // 4 * src[8]
        shl             v24.4h, v18.4h, #4          // 16 * src[40]
        shl             v25.4h, v20.4h, #4          // 16 * src[56]
        shl             v26.4h, v18.4h, #2          // 4 * src[40]
        trn2            v1.4s, v1.4s, v2.4s         // 20 21 22 23 60 61 62 63
        ssra            v24.4h, v21.4h, #2          // 4 * src[24] + 16 * src[40]
        sub             v2.4h, v25.4h, v23.4h       // - 4 * src[8] + 16 * src[56]
        shl             v17.4h, v17.4h, #2          // 8/2 * src[0]
        sub             v21.4h, v21.4h, v26.4h      // 16 * src[24] - 4 * src[40]
        shl             v22.4h, v22.4h, #2          // 8/2 * src[32]
        mov             d23, v1.d[1]                // 60 61 62 63
        ssra            v19.4h, v25.4h, #2          // 16 * src[8] + 4 * src[56]
        mul             v25.4h, v1.4h, v0.h[0]      // 6/2 * src[16]
        shl             v1.4h, v1.4h, #3            // 16/2 * src[16]
        mls             v24.4h, v3.4h, v0.h[2]      // - 15 * src[8] + 4 * src[24] + 16 * src[40]
        ssra            v17.4h, v17.4h, #1          // 12/2 * src[0]
        mls             v21.4h, v3.4h, v0.h[1]      // - 9 * src[8] + 16 * src[24] - 4 * src[40]
        ssra            v22.4h, v22.4h, #1          // 12/2 * src[32]
        mla             v2.4h, v16.4h, v0.h[1]      // - 4 * src[8] + 9 * src[24] + 16 * src[56]
        shl             v3.4h, v23.4h, #3           // 16/2 * src[48]
        mla             v19.4h, v16.4h, v0.h[2]     // 16 * src[8] + 15 * src[24] + 4 * src[56]
        mla             v1.4h, v23.4h, v0.h[0]      // t3/2 = 16/2 * src[16] + 6/2 * src[48]
        mla             v24.4h, v20.4h, v0.h[1]     // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
        add             v16.4h, v17.4h, v22.4h      // t1/2 = 12/2 * src[0] + 12/2 * src[32]
        sub             v3.4h, v25.4h, v3.4h        // t4/2 = 6/2 * src[16] - 16/2 * src[48]
        sub             v17.4h, v17.4h, v22.4h      // t2/2 = 12/2 * src[0] - 12/2 * src[32]
        mls             v21.4h, v20.4h, v0.h[2]     // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
        mla             v19.4h, v18.4h, v0.h[1]     // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
        add             v20.4h, v16.4h, v1.4h       // t5/2 = t1/2 + t3/2
        mls             v2.4h, v18.4h, v0.h[2]      // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
        // That was the last read of the coefficient vector, so v0 is reused.
        sub             v0.4h, v16.4h, v1.4h        // t8/2 = t1/2 - t3/2
        add             v18.4h, v17.4h, v3.4h       // t6/2 = t2/2 + t4/2
        sub             v22.4h, v17.4h, v3.4h       // t7/2 = t2/2 - t4/2
        neg             v23.4h, v24.4h              // +t2
        sub             v25.4h, v17.4h, v3.4h       // t7/2 = t2/2 - t4/2
        add             v3.4h, v17.4h, v3.4h        // t6/2 = t2/2 + t4/2
        neg             v17.4h, v21.4h              // +t3
        sub             v26.4h, v16.4h, v1.4h       // t8/2 = t1/2 - t3/2
        add             v1.4h, v16.4h, v1.4h        // t5/2 = t1/2 + t3/2
        neg             v16.4h, v19.4h              // -t1
        neg             v27.4h, v2.4h               // +t4
        // The "-" outputs use the rounding form srsra, which supplies the
        // extra +1 bias.  They end up in the upper halves below.
        ssra            v20.4h, v19.4h, #1          // (t5 + t1) >> 1
        srsra           v0.4h, v2.4h, #1            // (t8 - t4 + 1) >> 1
        ssra            v18.4h, v23.4h, #1          // (t6 + t2) >> 1
        srsra           v22.4h, v21.4h, #1          // (t7 - t3 + 1) >> 1
        ssra            v25.4h, v17.4h, #1          // (t7 + t3) >> 1
        srsra           v3.4h, v24.4h, #1           // (t6 - t2 + 1) >> 1
        ssra            v26.4h, v27.4h, #1          // (t8 + t4) >> 1
        srsra           v1.4h, v16.4h, #1           // (t5 - t1 + 1) >> 1
        trn1            v0.2d, v20.2d, v0.2d
        trn1            v2.2d, v18.2d, v22.2d
        trn1            v3.2d, v25.2d, v3.2d
        trn1            v1.2d, v26.2d, v1.2d
        srshr           v0.8h, v0.8h, #6            // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
        srshr           v2.8h, v2.8h, #6            // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
        srshr           v3.8h, v3.8h, #6            // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
        srshr           v1.8h, v1.8h, #6            // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
        // Widen-add the unsigned destination samples, then narrow with
        // unsigned saturation back to 8 bits.
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v2.8h, v2.8h, v6.8b
        uaddw           v3.8h, v3.8h, v7.8b
        uaddw           v1.8h, v1.8h, v4.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x4], x1
        st1             {v2.s}[0], [x4], x1
        st1             {v3.s}[0], [x4], x1
        st1             {v1.s}[0], [x4], x1
        st1             {v0.s}[1], [x4], x1
        st1             {v2.s}[1], [x4], x1
        st1             {v3.s}[1], [x4], x1
        st1             {v1.s}[1], [x4]
        ret
endfunc

// VC-1 4x4 inverse transform
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
// Notes:
//   v0 holds the multipliers 5, 11, 17 in h[0], h[1], h[2] (loaded from .Lcoeffs_it4)
//   x4 keeps the original sample pointer for the final stores; x0/x2 are advanced by the loads
function ff_vc1_inv_trans_4x4_neon, export=1
        mov             x3, #16                 // coefficient row stride in bytes (8 x 16-bit)
        ldr             d0, .Lcoeffs_it4
        mov             x4, x0
        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
        ld1             {v4.d}[0], [x2]         // 30 31 32 33
        ld1             {v5.s}[0], [x0], x1     // sample row 0
        ld1             {v5.s}[1], [x0], x1     // sample row 1
        ld1             {v6.s}[0], [x0], x1     // sample row 2
        // Transpose so that each vector holds one coefficient column (lane n = row n)
        trn2            v7.4h, v1.4h, v2.4h     // 01 11 03 13
        trn1            v1.4h, v1.4h, v2.4h     // 00 10 02 12
        ld1             {v6.s}[1], [x0]         // sample row 3
        trn2            v2.4h, v3.4h, v4.4h     // 21 31 23 33
        trn1            v3.4h, v3.4h, v4.4h     // 20 30 22 32
        trn2            v4.2s, v7.2s, v2.2s     // 03 13 23 33
        trn1            v16.2s, v1.2s, v3.2s    // 00 10 20 30
        trn1            v2.2s, v7.2s, v2.2s     // 01 11 21 31
        trn2            v1.2s, v1.2s, v3.2s     // 02 12 22 32
        // First pass: combines src[0..3] within each row, one row per lane
        mul             v3.4h, v4.4h, v0.h[0]   // 10/2 * src[3]
        mul             v4.4h, v4.4h, v0.h[1]   // 22/2 * src[3]
        mul             v7.4h, v16.4h, v0.h[2]  // 17 * src[0]
        mul             v1.4h, v1.4h, v0.h[2]   // 17 * src[2]
        mla             v3.4h, v2.4h, v0.h[1]   // t3/2 = 22/2 * src[1] + 10/2 * src[3]
        mls             v4.4h, v2.4h, v0.h[0]   // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
        add             v2.4h, v7.4h, v1.4h     // t1 = 17 * src[0] + 17 * src[2]
        sub             v1.4h, v7.4h, v1.4h     // t2 = 17 * src[0] - 17 * src[2]
        neg             v7.4h, v3.4h            // -t3/2
        neg             v16.4h, v4.4h           // -t4/2
        ssra            v3.4h, v2.4h, #1        // (t1 + t3) >> 1
        ssra            v4.4h, v1.4h, #1        // (t2 + t4) >> 1
        ssra            v16.4h, v1.4h, #1       // (t2 - t4) >> 1
        ssra            v7.4h, v2.4h, #1        // (t1 - t3) >> 1
        srshr           v1.4h, v3.4h, #2        // (t1 + t3 + 4) >> 3
        srshr           v2.4h, v4.4h, #2        // (t2 + t4 + 4) >> 3
        srshr           v3.4h, v16.4h, #2       // (t2 - t4 + 4) >> 3
        srshr           v4.4h, v7.4h, #2        // (t1 - t3 + 4) >> 3
        // Transpose back so that each vector holds one row (lane n = column n)
        trn2            v7.4h, v1.4h, v3.4h     // 10 11 30 31
        trn1            v1.4h, v1.4h, v3.4h     // 00 01 20 21
        trn2            v3.4h, v2.4h, v4.4h     // 12 13 32 33
        trn1            v2.4h, v2.4h, v4.4h     // 02 03 22 23
        trn2            v4.2s, v7.2s, v3.2s     // 30 31 32 33
        trn1            v16.2s, v1.2s, v2.2s    // 00 01 02 03
        trn1            v3.2s, v7.2s, v3.2s     // 10 11 12 13
        trn2            v1.2s, v1.2s, v2.2s     // 20 21 22 23
        // Second pass: combines src[0], src[8], src[16], src[24] (the four rows), one column per lane
        mul             v2.4h, v4.4h, v0.h[1]   // 22/2 * src[24]
        mul             v4.4h, v4.4h, v0.h[0]   // 10/2 * src[24]
        mul             v7.4h, v16.4h, v0.h[2]  // 17 * src[0]
        mul             v1.4h, v1.4h, v0.h[2]   // 17 * src[16]
        mls             v2.4h, v3.4h, v0.h[0]   // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
        mla             v4.4h, v3.4h, v0.h[1]   // t3/2 = 22/2 * src[8] + 10/2 * src[24]
        add             v0.4h, v7.4h, v1.4h     // t1 = 17 * src[0] + 17 * src[16] (multipliers in v0 no longer needed)
        sub             v1.4h, v7.4h, v1.4h     // t2 = 17 * src[0] - 17 * src[16]
        neg             v3.4h, v2.4h            // -t4/2
        neg             v7.4h, v4.4h            // -t3/2
        ssra            v4.4h, v0.4h, #1        // (t1 + t3) >> 1
        ssra            v3.4h, v1.4h, #1        // (t2 - t4) >> 1
        ssra            v2.4h, v1.4h, #1        // (t2 + t4) >> 1
        ssra            v7.4h, v0.4h, #1        // (t1 - t3) >> 1
        trn1            v0.2d, v4.2d, v3.2d     // rows 0 and 1 packed into one vector
        trn1            v1.2d, v2.2d, v7.2d     // rows 2 and 3 packed into one vector
        srshr           v0.8h, v0.8h, #6        // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
        srshr           v1.8h, v1.8h, #6        // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
        uaddw           v0.8h, v0.8h, v5.8b     // add zero-extended samples, rows 0 and 1
        uaddw           v1.8h, v1.8h, v6.8b     // add zero-extended samples, rows 2 and 3
        sqxtun          v0.8b, v0.8h            // saturate to unsigned 8 bits
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x4], x1
        st1             {v0.s}[1], [x4], x1
        st1             {v1.s}[0], [x4], x1
        st1             {v1.s}[1], [x4]
        ret
endfunc

// VC-1 8x8 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
// Notes:
//   the scalar DC arithmetic is interleaved with the sample loads;
//   x3 keeps the original sample pointer for the stores
function ff_vc1_inv_trans_8x8_dc_neon, export=1
        ldrsh           w2, [x2]                // dc (sign-extended)
        mov             x3, x0
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        add             w2, w2, w2, lsl #1      // 3 * dc
        ld1             {v3.8b}, [x0], x1
        ld1             {v4.8b}, [x0], x1
        add             w2, w2, #1              // 3 * dc + 1
        ld1             {v5.8b}, [x0], x1
        asr             w2, w2, #1              // dc' = (3 * dc + 1) >> 1
        ld1             {v6.8b}, [x0], x1
        add             w2, w2, w2, lsl #1      // 3 * dc'
        ld1             {v7.8b}, [x0]
        add             w0, w2, #16             // 3 * dc' + 16 (x0 is free once all rows are loaded)
        asr             w0, w0, #5              // (3 * dc' + 16) >> 5
        dup             v16.8h, w0
        uaddw           v0.8h, v16.8h, v0.8b    // dc + zero-extended samples
        uaddw           v1.8h, v16.8h, v1.8b
        uaddw           v2.8h, v16.8h, v2.8b
        uaddw           v3.8h, v16.8h, v3.8b
        uaddw           v4.8h, v16.8h, v4.8b
        uaddw           v5.8h, v16.8h, v5.8b
        sqxtun          v0.8b, v0.8h            // saturate to unsigned 8 bits
        uaddw           v6.8h, v16.8h, v6.8b
        sqxtun          v1.8b, v1.8h
        uaddw           v7.8h, v16.8h, v7.8b
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        st1             {v0.8b}, [x3], x1       // row 0
        sqxtun          v0.8b, v5.8h            // v0 reused for row 5
        st1             {v1.8b}, [x3], x1       // row 1
        sqxtun          v1.8b, v6.8h            // v1 reused for row 6
        st1             {v2.8b}, [x3], x1       // row 2
        sqxtun          v2.8b, v7.8h            // v2 reused for row 7
        st1             {v3.8b}, [x3], x1       // row 3
        st1             {v4.8b}, [x3], x1       // row 4
        st1             {v0.8b}, [x3], x1       // row 5
        st1             {v1.8b}, [x3], x1       // row 6
        st1             {v2.8b}, [x3]           // row 7
        ret
endfunc

// VC-1 8x4 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
// Notes:
//   4 rows of 8 samples are updated; x3 keeps the original sample pointer for the stores
function ff_vc1_inv_trans_8x4_dc_neon, export=1
        ldrsh           w2, [x2]                // dc (sign-extended)
        mov             x3, x0
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        add             w2, w2, w2, lsl #1      // 3 * dc
        ld1             {v3.8b}, [x0]
        add             w0, w2, #1              // 3 * dc + 1 (x0 is free once all rows are loaded)
        asr             w0, w0, #1              // dc' = (3 * dc + 1) >> 1
        add             w0, w0, w0, lsl #4      // 17 * dc'
        add             w0, w0, #64             // 17 * dc' + 64
        asr             w0, w0, #7              // (17 * dc' + 64) >> 7
        dup             v4.8h, w0
        uaddw           v0.8h, v4.8h, v0.8b     // dc + zero-extended samples
        uaddw           v1.8h, v4.8h, v1.8b
        uaddw           v2.8h, v4.8h, v2.8b
        uaddw           v3.8h, v4.8h, v3.8b
        sqxtun          v0.8b, v0.8h            // saturate to unsigned 8 bits
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        st1             {v3.8b}, [x3]
        ret
endfunc

// VC-1 4x8 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
// Notes:
//   8 rows of 4 samples are updated; rows n and n+4 share vector vn (lanes s[0] and s[1])
function ff_vc1_inv_trans_4x8_dc_neon, export=1
        ldrsh           w2, [x2]                // dc (sign-extended)
        mov             x3, x0
        ld1             {v0.s}[0], [x0], x1     // row 0
        ld1             {v1.s}[0], [x0], x1     // row 1
        ld1             {v2.s}[0], [x0], x1     // row 2
        add             w2, w2, w2, lsl #4      // 17 * dc
        ld1             {v3.s}[0], [x0], x1     // row 3
        add             w2, w2, #4              // 17 * dc + 4
        asr             w2, w2, #3              // dc' = (17 * dc + 4) >> 3
        add             w2, w2, w2, lsl #1      // 3 * dc'
        ld1             {v0.s}[1], [x0], x1     // row 4
        add             w2, w2, #16             // 3 * dc' + 16
        asr             w2, w2, #5              // (3 * dc' + 16) >> 5
        dup             v4.8h, w2
        ld1             {v1.s}[1], [x0], x1     // row 5
        ld1             {v2.s}[1], [x0], x1     // row 6
        ld1             {v3.s}[1], [x0]         // row 7
        uaddw           v0.8h, v4.8h, v0.8b     // dc + zero-extended samples
        uaddw           v1.8h, v4.8h, v1.8b
        uaddw           v2.8h, v4.8h, v2.8b
        uaddw           v3.8h, v4.8h, v3.8b
        sqxtun          v0.8b, v0.8h            // saturate to unsigned 8 bits
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3], x1
        st1             {v2.s}[0], [x3], x1
        st1             {v3.s}[0], [x3], x1
        st1             {v0.s}[1], [x3], x1
        st1             {v1.s}[1], [x3], x1
        st1             {v2.s}[1], [x3], x1
        st1             {v3.s}[1], [x3]
        ret
endfunc

// VC-1 4x4 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
// Notes:
//   rows 0 and 2 share v0, rows 1 and 3 share v1 (lanes s[0] and s[1])
function ff_vc1_inv_trans_4x4_dc_neon, export=1
        ldrsh           w2, [x2]                // dc (sign-extended)
        mov             x3, x0
        ld1             {v0.s}[0], [x0], x1     // row 0
        ld1             {v1.s}[0], [x0], x1     // row 1
        ld1             {v0.s}[1], [x0], x1     // row 2
        add             w2, w2, w2, lsl #4      // 17 * dc
        ld1             {v1.s}[1], [x0]         // row 3
        add             w0, w2, #4              // 17 * dc + 4 (x0 is free once all rows are loaded)
        asr             w0, w0, #3              // dc' = (17 * dc + 4) >> 3
        add             w0, w0, w0, lsl #4      // 17 * dc'
        add             w0, w0, #64             // 17 * dc' + 64
        asr             w0, w0, #7              // (17 * dc' + 64) >> 7
        dup             v2.8h, w0
        uaddw           v0.8h, v2.8h, v0.8b     // dc + zero-extended samples
        uaddw           v1.8h, v2.8h, v1.8b
        sqxtun          v0.8b, v0.8h            // saturate to unsigned 8 bits
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3], x1
        st1             {v0.s}[1], [x3], x1
        st1             {v1.s}[1], [x3]
        ret
endfunc

// Multiplier tables. Each is read with "ldr d0, <label>" and used by lane,
// where h[0] is the least significant 16 bits of the .quad value.
.align  5
.Lcoeffs_it8:
.quad   0x000F00090003                          // h[0] = 3, h[1] = 9, h[2] = 15
.Lcoeffs_it4:
.quad   0x0011000B0005                          // h[0] = 5, h[1] = 11, h[2] = 17
.Lcoeffs:
.quad   0x00050002                              // h[0] = 2, h[1] = 5

// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
// On exit:
//   the P4 and P5 rows (the rows either side of the boundary) are rewritten,
//   unless the early exit at label 1 is taken, in which case nothing is stored
function ff_vc1_v_loop_filter4_neon, export=1
        sub             x3, x0, x1, lsl #2      // x3 -> P1 row, 4 rows above x0; uses the full 64-bit stride like every post-indexed access below
        ldr             d0, .Lcoeffs            // h[0] = 2, h[1] = 5
        ld1             {v1.s}[0], [x0], x1     // P5
        ld1             {v2.s}[0], [x3], x1     // P1
        ld1             {v3.s}[0], [x3], x1     // P2
        ld1             {v4.s}[0], [x0], x1     // P6
        ld1             {v5.s}[0], [x3], x1     // P3
        ld1             {v6.s}[0], [x0], x1     // P7
        ld1             {v7.s}[0], [x3]         // P4 (x3 left pointing at the P4 row for the stores)
        ld1             {v16.s}[0], [x0]        // P8
        ushll           v17.8h, v1.8b, #1       // 2*P5
        dup             v18.8h, w2              // pq
        ushll           v2.8h, v2.8b, #1        // 2*P1
        uxtl            v3.8h, v3.8b            // P2
        uxtl            v4.8h, v4.8b            // P6
        uxtl            v19.8h, v5.8b           // P3
        mls             v2.4h, v3.4h, v0.h[1]   // 2*P1-5*P2
        uxtl            v3.8h, v6.8b            // P7
        mls             v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
        ushll           v5.8h, v5.8b, #1        // 2*P3
        uxtl            v6.8h, v7.8b            // P4
        mla             v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v3.8h, v16.8b           // P8
        mla             v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
        uxtl            v1.8h, v1.8b            // P5
        mls             v5.4h, v6.4h, v0.h[1]   // 2*P3-5*P4
        mls             v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        sub             v3.4h, v6.4h, v1.4h     // P4-P5
        mls             v2.4h, v6.4h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
        mla             v5.4h, v1.4h, v0.h[1]   // 2*P3-5*P4+5*P5
        mls             v5.4h, v4.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        abs             v4.4h, v3.4h
        srshr           v7.4h, v17.4h, #3
        srshr           v2.4h, v2.4h, #3
        sshr            v4.4h, v4.4h, #1        // clip
        srshr           v5.4h, v5.4h, #3
        abs             v7.4h, v7.4h            // a2
        sshr            v3.4h, v3.4h, #8        // clip_sign
        abs             v2.4h, v2.4h            // a1
        cmeq            v16.4h, v4.4h, #0       // test clip == 0
        abs             v17.4h, v5.4h           // a0
        sshr            v5.4h, v5.4h, #8        // a0_sign
        cmhs            v19.4h, v2.4h, v7.4h    // test a1 >= a2
        cmhs            v18.4h, v17.4h, v18.4h  // test a0 >= pq
        sub             v3.4h, v3.4h, v5.4h     // clip_sign - a0_sign
        bsl             v19.8b, v7.8b, v2.8b    // a3
        orr             v2.8b, v16.8b, v18.8b   // test clip == 0 || a0 >= pq
        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.8b, v2.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w0, v5.s[1]             // move to gp reg (bit 0 of s[1] is the flag of halfword lane 2)
        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        cmhs            v5.4h, v0.4h, v4.4h
        tbnz            w0, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
        bsl             v5.8b, v4.8b, v0.8b     // FFMIN(d, clip)
        bic             v0.8b, v5.8b, v2.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mls             v6.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        mla             v1.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v6.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x3], x1     // P4 row
        st1             {v1.s}[0], [x3]         // P5 row
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
// On exit:
//   the P4/P5 byte pair straddling the boundary is rewritten in each of the 4 rows,
//   unless the early exit at label 1 is taken, in which case nothing is stored
function ff_vc1_h_loop_filter4_neon, export=1
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs            // h[0] = 2, h[1] = 5
        ld1             {v1.8b}, [x3], x1
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3]
        dup             v5.8h, w2               // pq
        // Transpose the 4 rows of 8 bytes so that each Pn occupies 4 consecutive bytes
        trn1            v6.8b, v1.8b, v2.8b
        trn2            v1.8b, v1.8b, v2.8b
        trn1            v2.8b, v3.8b, v4.8b
        trn2            v3.8b, v3.8b, v4.8b
        trn1            v4.4h, v6.4h, v2.4h     // P1, P5
        trn1            v7.4h, v1.4h, v3.4h     // P2, P6
        trn2            v2.4h, v6.4h, v2.4h     // P3, P7
        trn2            v1.4h, v1.4h, v3.4h     // P4, P8
        ushll           v3.8h, v4.8b, #1        // 2*P1, 2*P5
        uxtl            v6.8h, v7.8b            // P2, P6
        uxtl            v7.8h, v2.8b            // P3, P7
        uxtl            v1.8h, v1.8b            // P4, P8
        mls             v3.8h, v6.8h, v0.h[1]   // 2*P1-5*P2, 2*P5-5*P6
        ushll           v2.8h, v2.8b, #1        // 2*P3, 2*P7
        uxtl            v4.8h, v4.8b            // P1, P5
        mla             v3.8h, v7.8h, v0.h[1]   // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
        mov             d6, v6.d[1]             // P6
        mls             v3.8h, v1.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
        mov             d4, v4.d[1]             // P5
        mls             v2.4h, v1.4h, v0.h[1]   // 2*P3-5*P4
        mla             v2.4h, v4.4h, v0.h[1]   // 2*P3-5*P4+5*P5
        sub             v7.4h, v1.4h, v4.4h     // P4-P5
        mls             v2.4h, v6.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        srshr           v3.8h, v3.8h, #3
        abs             v6.4h, v7.4h
        sshr            v7.4h, v7.4h, #8        // clip_sign
        srshr           v2.4h, v2.4h, #3
        abs             v3.8h, v3.8h            // a1, a2
        sshr            v6.4h, v6.4h, #1        // clip
        mov             d16, v3.d[1]            // a2
        abs             v17.4h, v2.4h           // a0
        cmeq            v18.4h, v6.4h, #0       // test clip == 0
        sshr            v2.4h, v2.4h, #8        // a0_sign
        cmhs            v19.4h, v3.4h, v16.4h   // test a1 >= a2
        cmhs            v5.4h, v17.4h, v5.4h    // test a0 >= pq
        sub             v2.4h, v7.4h, v2.4h     // clip_sign - a0_sign
        bsl             v19.8b, v16.8b, v3.8b   // a3
        orr             v3.8b, v18.8b, v5.8b    // test clip == 0 || a0 >= pq
        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.8b, v3.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]             // move to gp reg (bit 0 of s[1] is the flag of halfword lane 2)
        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        cmhs            v5.4h, v0.4h, v6.4h
        tbnz            w2, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
        bsl             v5.8b, v6.8b, v0.8b     // FFMIN(d, clip)
        bic             v0.8b, v5.8b, v3.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v4.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v1.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v3.8b, v4.8h
        sqxtun          v2.8b, v1.8h
        st2             {v2.b, v3.b}[0], [x0], x1 // P4, P5 of row 0
        st2             {v2.b, v3.b}[1], [x0], x1 // P4, P5 of row 1
        st2             {v2.b, v3.b}[2], [x0], x1 // P4, P5 of row 2
        st2             {v2.b, v3.b}[3], [x0]     // P4, P5 of row 3
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 8 pixel pairs at
// boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
// On exit:
//   the P4 and P5 rows (the rows either side of the boundary) are rewritten,
//   unless the early exit at label 1 is taken, in which case nothing is stored
function ff_vc1_v_loop_filter8_neon, export=1
        sub             x3, x0, x1, lsl #2      // x3 -> P1 row, 4 rows above x0; uses the full 64-bit stride like every post-indexed access below
        ldr             d0, .Lcoeffs            // h[0] = 2, h[1] = 5
        ld1             {v1.8b}, [x0], x1       // P5
        movi            v2.2d, #0x0000ffff00000000 // mask selecting halfword lane 2 of each group of 4
        ld1             {v3.8b}, [x3], x1       // P1
        ld1             {v4.8b}, [x3], x1       // P2
        ld1             {v5.8b}, [x0], x1       // P6
        ld1             {v6.8b}, [x3], x1       // P3
        ld1             {v7.8b}, [x0], x1       // P7
        ushll           v16.8h, v1.8b, #1       // 2*P5
        ushll           v3.8h, v3.8b, #1        // 2*P1
        ld1             {v17.8b}, [x3]          // P4 (x3 left pointing at the P4 row for the stores)
        uxtl            v4.8h, v4.8b            // P2
        ld1             {v18.8b}, [x0]          // P8
        uxtl            v5.8h, v5.8b            // P6
        dup             v19.8h, w2              // pq
        uxtl            v20.8h, v6.8b           // P3
        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1-5*P2
        uxtl            v4.8h, v7.8b            // P7
        ushll           v6.8h, v6.8b, #1        // 2*P3
        mls             v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v7.8h, v17.8b           // P4
        uxtl            v17.8h, v18.8b          // P8
        mla             v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v1.8h, v1.8b            // P5
        mla             v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v4.8h, v7.8h, v1.8h     // P4-P5
        mls             v6.8h, v7.8h, v0.h[1]   // 2*P3-5*P4
        mls             v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
        abs             v17.8h, v4.8h
        sshr            v4.8h, v4.8h, #8        // clip_sign
        mls             v3.8h, v7.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
        sshr            v17.8h, v17.8h, #1      // clip
        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3-5*P4+5*P5
        srshr           v16.8h, v16.8h, #3
        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        cmeq            v5.8h, v17.8h, #0       // test clip == 0
        srshr           v3.8h, v3.8h, #3
        abs             v16.8h, v16.8h          // a2
        abs             v3.8h, v3.8h            // a1
        srshr           v6.8h, v6.8h, #3
        cmhs            v18.8h, v3.8h, v16.8h   // test a1 >= a2
        abs             v20.8h, v6.8h           // a0
        sshr            v6.8h, v6.8h, #8        // a0_sign
        bsl             v18.16b, v16.16b, v3.16b // a3
        cmhs            v3.8h, v20.8h, v19.8h   // test a0 >= pq
        sub             v4.8h, v4.8h, v6.8h     // clip_sign - a0_sign
        uqsub           v6.8h, v20.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        cmhs            v16.8h, v18.8h, v20.8h  // test a3 >= a0
        orr             v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
        mul             v0.8h, v6.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
        cmtst           v2.2d, v5.2d, v2.2d     // if the pair in lane 2 of each group of 4 is not filtered, then none of the others in the group should be either
        mov             w0, v5.s[1]             // move to gp reg (flag of lane 2, first group)
        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        mov             w2, v5.s[3]             // flag of lane 2, second group
        orr             v2.16b, v3.16b, v2.16b  // per-lane "do not filter" mask, including the whole-group veto
        cmhs            v3.8h, v0.8h, v17.8h
        and             w0, w0, w2              // set only if both groups are vetoed
        bsl             v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
        tbnz            w0, #0, 1f              // none of the 8 pixel pairs should be updated in this case
        bic             v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
        mls             v7.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        mla             v1.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v7.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.8b}, [x3], x1       // P4 row
        st1             {v1.8b}, [x3]           // P5 row
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
// On exit:
//   the P4/P5 byte pair straddling the boundary is rewritten in the rows of each
//   group of 4 that is not vetoed; the two groups are tested and stored independently
function ff_vc1_h_loop_filter8_neon, export=1
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs            // h[0] = 2, h[1] = 5
        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        add             x4, x0, x1, lsl #2      // where to start writing the second group of 4 rows
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3], x1
        ld1             {v5.8b}, [x3], x1
        ld1             {v6.8b}, [x3], x1
        ld1             {v7.8b}, [x3], x1
        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
        ld1             {v17.8b}, [x3]
        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
        dup             v4.8h, w2               // pq
        trn1            v18.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
        trn1            v6.4h, v16.4h, v2.4h    // P1[0], P1[1], P1[2], P1[3], P5[0]...
        trn1            v19.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
        trn1            v20.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
        trn2            v2.4h, v16.4h, v2.4h    // P3[0], P3[1], P3[2], P3[3], P7[0]...
        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
        trn1            v3.4h, v18.4h, v20.4h   // P1[4], P1[5], P1[6], P1[7], P5[4]...
        trn1            v16.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
        trn2            v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
        trn2            v5.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
        trn1            v7.2s, v6.2s, v3.2s     // P1
        trn1            v18.2s, v19.2s, v16.2s  // P2
        trn2            v3.2s, v6.2s, v3.2s     // P5
        trn2            v6.2s, v19.2s, v16.2s   // P6
        trn1            v16.2s, v2.2s, v17.2s   // P3
        trn2            v2.2s, v2.2s, v17.2s    // P7
        ushll           v7.8h, v7.8b, #1        // 2*P1
        trn1            v17.2s, v1.2s, v5.2s    // P4
        ushll           v19.8h, v3.8b, #1       // 2*P5
        trn2            v1.2s, v1.2s, v5.2s     // P8
        uxtl            v5.8h, v18.8b           // P2
        uxtl            v6.8h, v6.8b            // P6
        uxtl            v18.8h, v16.8b          // P3
        mls             v7.8h, v5.8h, v0.h[1]   // 2*P1-5*P2
        uxtl            v2.8h, v2.8b            // P7
        ushll           v5.8h, v16.8b, #1       // 2*P3
        mls             v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v16.8h, v17.8b          // P4
        uxtl            v1.8h, v1.8b            // P8
        mla             v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v2.8h, v3.8b            // P5
        mla             v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v3.8h, v16.8h, v2.8h    // P4-P5
        mls             v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
        mls             v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        abs             v1.8h, v3.8h
        sshr            v3.8h, v3.8h, #8        // clip_sign
        mls             v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
        sshr            v1.8h, v1.8h, #1        // clip
        mla             v5.8h, v2.8h, v0.h[1]   // 2*P3-5*P4+5*P5
        srshr           v17.8h, v19.8h, #3
        mls             v5.8h, v6.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        cmeq            v6.8h, v1.8h, #0        // test clip == 0
        srshr           v7.8h, v7.8h, #3
        abs             v17.8h, v17.8h          // a2
        abs             v7.8h, v7.8h            // a1
        srshr           v5.8h, v5.8h, #3
        cmhs            v18.8h, v7.8h, v17.8h   // test a1 >= a2
        abs             v19.8h, v5.8h           // a0
        sshr            v5.8h, v5.8h, #8        // a0_sign
        bsl             v18.16b, v17.16b, v7.16b // a3
        cmhs            v4.8h, v19.8h, v4.8h    // test a0 >= pq
        sub             v3.8h, v3.8h, v5.8h     // clip_sign - a0_sign
        uqsub           v5.8h, v19.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        cmhs            v7.8h, v18.8h, v19.8h   // test a3 >= a0
        orr             v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
        mul             v0.8h, v5.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]             // move to gp reg (flag of lane 2, first group)
        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        mov             w3, v5.s[3]             // flag of lane 2, second group (x3 is no longer needed as a pointer)
        cmhs            v5.8h, v0.8h, v1.8h
        and             w5, w2, w3              // set only if both groups are vetoed
        bsl             v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
        tbnz            w5, #0, 2f              // none of the 8 pixel pairs should be updated in this case
        bic             v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v2.8h, v0.8h, v3.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v16.8h, v0.8h, v3.8h    // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v1.8b, v2.8h
        sqxtun          v0.8b, v16.8h
        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[0], [x0], x1
        st2             {v0.b, v1.b}[1], [x0], x1
        st2             {v0.b, v1.b}[2], [x0], x1
        st2             {v0.b, v1.b}[3], [x0]
1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[4], [x4], x1
        st2             {v0.b, v1.b}[5], [x4], x1
        st2             {v0.b, v1.b}[6], [x4], x1
        st2             {v0.b, v1.b}[7], [x4]
2:      ret
endfunc

// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT
// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
//
// Overview (derived from the code below):
//   Eight rows of 16 pels are loaded: P1..P4 from the four rows above x0 and
//   P5..P8 from x0 downwards. Each 16-byte row is widened to two vectors of
//   eight 16-bit lanes ([0..7] and [8..15]). Only the P4 and P5 rows are
//   written back, and only if at least one group of 4 pixel pairs is filtered.
//   v0.h[0] and v0.h[1] are used as the multipliers annotated as 2 and 5 in
//   the comments; .Lcoeffs itself is defined outside this part of the file.
// Clobbers (as written below): x0, x2-x5, v0-v7, v16-v29.
function ff_vc1_v_loop_filter16_neon, export=1
        // NOTE(review): only the low 32 bits of the stride are used here (w1,
        // sign-extended), whereas every post-indexed access below uses the full
        // x1 - confirm against the caller's stride type.
        sub             x3, x0, w1, sxtw #2  // x3 -> row 4 strides above x0 (P1)
        ldr             d0, .Lcoeffs
        ld1             {v1.16b}, [x0], x1  // P5
        movi            v2.2d, #0x0000ffff00000000  // mask selecting halfword 2 of each group of 4 halfwords
        ld1             {v3.16b}, [x3], x1  // P1
        ld1             {v4.16b}, [x3], x1  // P2
        ld1             {v5.16b}, [x0], x1  // P6
        ld1             {v6.16b}, [x3], x1  // P3
        ld1             {v7.16b}, [x0], x1  // P7
        ushll           v16.8h, v1.8b, #1  // 2*P5[0..7]
        ushll           v17.8h, v3.8b, #1  // 2*P1[0..7]
        ld1             {v18.16b}, [x3]  // P4 (no post-increment: x3 stays on the P4 row for the stores at the end)
        uxtl            v19.8h, v4.8b  // P2[0..7]
        ld1             {v20.16b}, [x0]  // P8
        uxtl            v21.8h, v5.8b  // P6[0..7]
        dup             v22.8h, w2  // pq
        ushll2          v3.8h, v3.16b, #1  // 2*P1[8..15]
        mls             v17.8h, v19.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
        ushll2          v19.8h, v1.16b, #1  // 2*P5[8..15]
        uxtl2           v4.8h, v4.16b  // P2[8..15]
        mls             v16.8h, v21.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
        uxtl2           v5.8h, v5.16b  // P6[8..15]
        uxtl            v23.8h, v6.8b  // P3[0..7]
        uxtl            v24.8h, v7.8b  // P7[0..7]
        mls             v3.8h, v4.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]
        ushll           v4.8h, v6.8b, #1  // 2*P3[0..7]
        uxtl            v25.8h, v18.8b  // P4[0..7]
        mls             v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
        uxtl2           v26.8h, v6.16b  // P3[8..15]
        mla             v17.8h, v23.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        uxtl2           v7.8h, v7.16b  // P7[8..15]
        ushll2          v6.8h, v6.16b, #1  // 2*P3[8..15]
        mla             v16.8h, v24.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        uxtl2           v18.8h, v18.16b  // P4[8..15]
        uxtl            v23.8h, v20.8b  // P8[0..7]
        mls             v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
        uxtl            v24.8h, v1.8b  // P5[0..7]
        uxtl2           v20.8h, v20.16b  // P8[8..15]
        mla             v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        uxtl2           v1.8h, v1.16b  // P5[8..15]
        sub             v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
        mla             v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        sub             v7.8h, v18.8h, v1.8h  // P4[8..15]-P5[8..15]
        mls             v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
        abs             v27.8h, v26.8h  // |P4[0..7]-P5[0..7]|
        sshr            v26.8h, v26.8h, #8  // clip_sign[0..7]
        mls             v17.8h, v25.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        abs             v28.8h, v7.8h  // |P4[8..15]-P5[8..15]|
        sshr            v27.8h, v27.8h, #1  // clip[0..7]
        mls             v16.8h, v23.8h, v0.h[0]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        sshr            v7.8h, v7.8h, #8  // clip_sign[8..15]
        sshr            v23.8h, v28.8h, #1  // clip[8..15]
        mla             v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        cmeq            v28.8h, v27.8h, #0  // test clip[0..7] == 0
        srshr           v17.8h, v17.8h, #3  // rounding shift: (x + 4) >> 3
        mls             v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        cmeq            v29.8h, v23.8h, #0  // test clip[8..15] == 0
        srshr           v16.8h, v16.8h, #3
        mls             v19.8h, v20.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        abs             v17.8h, v17.8h  // a1[0..7]
        mla             v6.8h, v1.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        srshr           v3.8h, v3.8h, #3
        mls             v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        abs             v16.8h, v16.8h  // a2[0..7]
        srshr           v19.8h, v19.8h, #3
        mls             v6.8h, v5.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        cmhs            v5.8h, v17.8h, v16.8h  // test a1[0..7] >= a2[0..7]
        abs             v3.8h, v3.8h  // a1[8..15]
        srshr           v4.8h, v4.8h, #3
        abs             v19.8h, v19.8h  // a2[8..15]
        bsl             v5.16b, v16.16b, v17.16b  // a3[0..7] (a2 where a1 >= a2, else a1, i.e. the smaller of the two)
        srshr           v6.8h, v6.8h, #3
        cmhs            v16.8h, v3.8h, v19.8h  // test a1[8..15] >= a2[8..15]
        abs             v17.8h, v4.8h  // a0[0..7]
        sshr            v4.8h, v4.8h, #8  // a0_sign[0..7]
        bsl             v16.16b, v19.16b, v3.16b  // a3[8..15]
        uqsub           v3.8h, v17.8h, v5.8h  // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        abs             v19.8h, v6.8h  // a0[8..15]
        cmhs            v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
        cmhs            v5.8h, v5.8h, v17.8h  // test a3[0..7] >= a0[0..7]
        sub             v4.8h, v26.8h, v4.8h  // clip_sign[0..7] - a0_sign[0..7]
        sshr            v6.8h, v6.8h, #8  // a0_sign[8..15]
        mul             v3.8h, v3.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        uqsub           v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        orr             v20.16b, v28.16b, v20.16b  // test clip[0..7] == 0 || a0[0..7] >= pq
        cmhs            v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
        cmhs            v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
        mul             v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 (overwrites the coefficients in v0; this is their last use)
        sub             v6.8h, v7.8h, v6.8h  // clip_sign[8..15] - a0_sign[8..15]
        orr             v5.16b, v20.16b, v5.16b  // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        ushr            v3.8h, v3.8h, #3  // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        orr             v7.16b, v29.16b, v21.16b  // test clip[8..15] == 0 || a0[8..15] >= pq
        cmtst           v17.2d, v5.2d, v2.2d  // if pixel pair 2 (zero-based) of each group of 4 is not filtered, then none of the others in the group should be either: the whole 64-bit lane becomes all-ones
        mov             w0, v5.s[1]  // move to gp reg (bit 0 = "not filtered" flag of pixel pair 2)
        cmhs            v19.8h, v3.8h, v27.8h  // test d[0..7] >= clip[0..7]
        ushr            v0.8h, v0.8h, #3  // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        mov             w2, v5.s[3]  // bit 0 = "not filtered" flag of pixel pair 6
        orr             v5.16b, v7.16b, v16.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        orr             v16.16b, v20.16b, v17.16b  // per-lane "do not filter" mask for [0..7], including the group decision
        bsl             v19.16b, v27.16b, v3.16b  // FFMIN(d[0..7], clip[0..7])
        cmtst           v2.2d, v5.2d, v2.2d  // same group-of-4 propagation for [8..15]
        cmhs            v3.8h, v0.8h, v23.8h  // test d[8..15] >= clip[8..15]
        mov             w4, v5.s[1]  // bit 0 = "not filtered" flag of pixel pair 10
        mov             w5, v5.s[3]  // bit 0 = "not filtered" flag of pixel pair 14
        and             w0, w0, w2
        bic             v5.16b, v19.16b, v16.16b  // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        orr             v2.16b, v7.16b, v2.16b  // per-lane "do not filter" mask for [8..15], including the group decision
        bsl             v3.16b, v23.16b, v0.16b  // FFMIN(d[8..15], clip[8..15])
        mls             v25.8h, v5.8h, v4.8h  // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
        and             w2, w4, w5
        bic             v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v24.8h, v5.8h, v4.8h  // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
        and             w0, w0, w2  // bit 0 set only if all four groups are flagged "not filtered"
        mls             v18.8h, v0.8h, v6.8h  // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
        sqxtun          v2.8b, v25.8h  // narrow P4[0..7] back to unsigned bytes with saturation
        tbnz            w0, #0, 1f  // none of the 16 pixel pairs should be updated in this case
        mla             v1.8h, v0.8h, v6.8h  // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
        sqxtun          v0.8b, v24.8h  // P5[0..7]
        sqxtun2         v2.16b, v18.8h  // P4[8..15]
        sqxtun2         v0.16b, v1.8h  // P5[8..15]
        st1             {v2.16b}, [x3], x1  // write back the P4 row
        st1             {v0.16b}, [x3]  // write back the P5 row
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
//
// Overview (derived from the code below):
//   Sixteen rows of 8 bytes are loaded starting 4 bytes to the left of x0, so
//   that each row holds P1..P8. Three levels of trn1/trn2 (bytes, halfwords,
//   words) transpose them so that each vector holds one of P1..P8 for 8 rows.
//   Only P4/P5 are written back, as interleaved byte pairs, one row at a time,
//   and each group of 4 rows is stored or skipped as a unit.
//   v0.h[0] and v0.h[1] are used as the multipliers annotated as 2 and 5 in
//   the comments; .Lcoeffs itself is defined outside this part of the file.
// Clobbers (as written below): x0, x2-x10, v0-v7, v16-v31.
function ff_vc1_h_loop_filter16_neon, export=1
        sub             x3, x0, #4  // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1  // P1[0], P2[0]...
        sub             x0, x0, #1  // where to start writing
        ld1             {v2.8b}, [x3], x1
        add             x4, x0, x1, lsl #3  // write pointer for rows 8..11
        ld1             {v3.8b}, [x3], x1
        add             x5, x0, x1, lsl #2  // write pointer for rows 4..7
        ld1             {v4.8b}, [x3], x1
        add             x6, x4, x1, lsl #2  // write pointer for rows 12..15
        ld1             {v5.8b}, [x3], x1
        ld1             {v6.8b}, [x3], x1
        ld1             {v7.8b}, [x3], x1
        trn1            v16.8b, v1.8b, v2.8b  // P1[0], P1[1], P3[0]...
        ld1             {v17.8b}, [x3], x1
        trn2            v1.8b, v1.8b, v2.8b  // P2[0], P2[1], P4[0]...
        ld1             {v2.8b}, [x3], x1
        trn1            v18.8b, v3.8b, v4.8b  // P1[2], P1[3], P3[2]...
        ld1             {v19.8b}, [x3], x1
        trn2            v3.8b, v3.8b, v4.8b  // P2[2], P2[3], P4[2]...
        ld1             {v4.8b}, [x3], x1
        trn1            v20.8b, v5.8b, v6.8b  // P1[4], P1[5], P3[4]...
        ld1             {v21.8b}, [x3], x1
        trn2            v5.8b, v5.8b, v6.8b  // P2[4], P2[5], P4[4]...
        ld1             {v6.8b}, [x3], x1
        trn1            v22.8b, v7.8b, v17.8b  // P1[6], P1[7], P3[6]...
        ld1             {v23.8b}, [x3], x1
        trn2            v7.8b, v7.8b, v17.8b  // P2[6], P2[7], P4[6]...
        ld1             {v17.8b}, [x3], x1
        trn1            v24.8b, v2.8b, v19.8b  // P1[8], P1[9], P3[8]...
        ld1             {v25.8b}, [x3]
        trn2            v2.8b, v2.8b, v19.8b  // P2[8], P2[9], P4[8]...
        trn1            v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
        trn1            v26.8b, v4.8b, v21.8b  // P1[10], P1[11], P3[10]...
        trn2            v4.8b, v4.8b, v21.8b  // P2[10], P2[11], P4[10]...
        trn1            v21.4h, v1.4h, v3.4h  // P2[0], P2[1], P2[2], P2[3], P6[0]...
        trn1            v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
        trn1            v28.8b, v6.8b, v23.8b  // P1[12], P1[13], P3[12]...
        trn2            v6.8b, v6.8b, v23.8b  // P2[12], P2[13], P4[12]...
        trn1            v23.4h, v5.4h, v7.4h  // P2[4], P2[5], P2[6], P2[7], P6[4]...
        trn1            v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
        trn1            v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
        trn2            v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
        trn1            v25.4h, v2.4h, v4.4h  // P2[8], P2[9], P2[10], P2[11], P6[8]...
        trn1            v31.2s, v19.2s, v27.2s  // P1[0..7]
        trn2            v19.2s, v19.2s, v27.2s  // P5[0..7]
        trn1            v27.2s, v21.2s, v23.2s  // P2[0..7]
        trn2            v21.2s, v21.2s, v23.2s  // P6[0..7]
        trn1            v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
        trn2            v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
        trn1            v18.4h, v6.4h, v17.4h  // P2[12], P2[13], P2[14], P2[15], P6[12]...
        trn2            v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
        trn2            v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
        trn1            v24.2s, v29.2s, v23.2s  // P1[8..15]
        trn2            v23.2s, v29.2s, v23.2s  // P5[8..15]
        trn1            v26.2s, v25.2s, v18.2s  // P2[8..15]
        trn2            v18.2s, v25.2s, v18.2s  // P6[8..15]
        trn2            v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
        trn2            v1.4h, v1.4h, v3.4h  // P4[0], P4[1], P4[2], P4[3], P8[0]...
        trn2            v3.4h, v5.4h, v7.4h  // P4[4], P4[5], P4[6], P4[7], P8[4]...
        trn2            v2.4h, v2.4h, v4.4h  // P4[8], P4[9], P4[10], P4[11], P8[8]...
        trn2            v4.4h, v6.4h, v17.4h  // P4[12], P4[13], P4[14], P4[15], P8[12]...
        ushll           v5.8h, v31.8b, #1  // 2*P1[0..7]
        ushll           v6.8h, v19.8b, #1  // 2*P5[0..7]
        trn1            v7.2s, v16.2s, v20.2s  // P3[0..7]
        uxtl            v17.8h, v27.8b  // P2[0..7]
        trn2            v16.2s, v16.2s, v20.2s  // P7[0..7]
        uxtl            v20.8h, v21.8b  // P6[0..7]
        trn1            v21.2s, v22.2s, v25.2s  // P3[8..15]
        ushll           v24.8h, v24.8b, #1  // 2*P1[8..15]
        trn2            v22.2s, v22.2s, v25.2s  // P7[8..15]
        ushll           v25.8h, v23.8b, #1  // 2*P5[8..15]
        trn1            v27.2s, v1.2s, v3.2s  // P4[0..7]
        uxtl            v26.8h, v26.8b  // P2[8..15]
        mls             v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
        uxtl            v17.8h, v18.8b  // P6[8..15]
        mls             v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
        trn1            v18.2s, v2.2s, v4.2s  // P4[8..15]
        uxtl            v28.8h, v7.8b  // P3[0..7]
        mls             v24.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]
        uxtl            v16.8h, v16.8b  // P7[0..7]
        uxtl            v26.8h, v21.8b  // P3[8..15]
        mls             v25.8h, v17.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
        uxtl            v22.8h, v22.8b  // P7[8..15]
        ushll           v7.8h, v7.8b, #1  // 2*P3[0..7]
        uxtl            v27.8h, v27.8b  // P4[0..7]
        trn2            v1.2s, v1.2s, v3.2s  // P8[0..7]
        ushll           v3.8h, v21.8b, #1  // 2*P3[8..15]
        trn2            v2.2s, v2.2s, v4.2s  // P8[8..15]
        uxtl            v4.8h, v18.8b  // P4[8..15]
        mla             v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        uxtl            v1.8h, v1.8b  // P8[0..7]
        mla             v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        uxtl            v2.8h, v2.8b  // P8[8..15]
        uxtl            v16.8h, v19.8b  // P5[0..7]
        mla             v24.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        uxtl            v18.8h, v23.8b  // P5[8..15]
        dup             v19.8h, w2  // pq
        mla             v25.8h, v22.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        sub             v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
        sub             v22.8h, v4.8h, v18.8h  // P4[8..15]-P5[8..15]
        mls             v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
        abs             v23.8h, v21.8h  // |P4[0..7]-P5[0..7]|
        mls             v3.8h, v4.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
        abs             v26.8h, v22.8h  // |P4[8..15]-P5[8..15]|
        sshr            v21.8h, v21.8h, #8  // clip_sign[0..7]
        mls             v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        sshr            v23.8h, v23.8h, #1  // clip[0..7]
        sshr            v26.8h, v26.8h, #1  // clip[8..15]
        mls             v6.8h, v1.8h, v0.h[0]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        sshr            v1.8h, v22.8h, #8  // clip_sign[8..15]
        cmeq            v22.8h, v23.8h, #0  // test clip[0..7] == 0
        mls             v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        cmeq            v28.8h, v26.8h, #0  // test clip[8..15] == 0
        srshr           v5.8h, v5.8h, #3  // rounding shift: (x + 4) >> 3
        mls             v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        srshr           v2.8h, v6.8h, #3
        mla             v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        srshr           v6.8h, v24.8h, #3
        mla             v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        abs             v5.8h, v5.8h  // a1[0..7]
        srshr           v24.8h, v25.8h, #3
        mls             v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        abs             v2.8h, v2.8h  // a2[0..7]
        abs             v6.8h, v6.8h  // a1[8..15]
        mls             v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        abs             v17.8h, v24.8h  // a2[8..15]
        cmhs            v20.8h, v5.8h, v2.8h  // test a1[0..7] >= a2[0..7]
        srshr           v3.8h, v3.8h, #3
        cmhs            v24.8h, v6.8h, v17.8h  // test a1[8..15] >= a2[8..15]
        srshr           v7.8h, v7.8h, #3
        bsl             v20.16b, v2.16b, v5.16b  // a3[0..7] (a2 where a1 >= a2, else a1, i.e. the smaller of the two)
        abs             v2.8h, v3.8h  // a0[8..15]
        sshr            v3.8h, v3.8h, #8  // a0_sign[8..15]
        bsl             v24.16b, v17.16b, v6.16b  // a3[8..15]
        abs             v5.8h, v7.8h  // a0[0..7]
        sshr            v6.8h, v7.8h, #8  // a0_sign[0..7]
        cmhs            v7.8h, v2.8h, v19.8h  // test a0[8..15] >= pq
        sub             v1.8h, v1.8h, v3.8h  // clip_sign[8..15] - a0_sign[8..15]
        uqsub           v3.8h, v2.8h, v24.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        cmhs            v2.8h, v24.8h, v2.8h  // test a3[8..15] >= a0[8..15]
        uqsub           v17.8h, v5.8h, v20.8h  // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        cmhs            v19.8h, v5.8h, v19.8h  // test a0[0..7] >= pq
        orr             v7.16b, v28.16b, v7.16b  // test clip[8..15] == 0 || a0[8..15] >= pq
        sub             v6.8h, v21.8h, v6.8h  // clip_sign[0..7] - a0_sign[0..7]
        mul             v3.8h, v3.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        cmhs            v5.8h, v20.8h, v5.8h  // test a3[0..7] >= a0[0..7]
        orr             v19.16b, v22.16b, v19.16b  // test clip[0..7] == 0 || a0[0..7] >= pq
        mul             v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 (overwrites the coefficients in v0; this is their last use)
        orr             v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        orr             v5.16b, v19.16b, v5.16b  // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        ushr            v3.8h, v3.8h, #3  // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        mov             w7, v2.s[1]  // bit 0 = "not filtered" flag of pixel pair 10
        mov             w8, v2.s[3]  // bit 0 = "not filtered" flag of pixel pair 14
        ushr            v0.8h, v0.8h, #3  // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        mov             w2, v5.s[1]  // move to gp reg (bit 0 = "not filtered" flag of pixel pair 2)
        cmhs            v2.8h, v3.8h, v26.8h  // test d[8..15] >= clip[8..15]
        mov             w3, v5.s[3]  // bit 0 = "not filtered" flag of pixel pair 6
        cmhs            v5.8h, v0.8h, v23.8h  // test d[0..7] >= clip[0..7]
        bsl             v2.16b, v26.16b, v3.16b  // FFMIN(d[8..15], clip[8..15])
        and             w9, w7, w8
        bsl             v5.16b, v23.16b, v0.16b  // FFMIN(d[0..7], clip[0..7])
        and             w10, w2, w3
        bic             v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        and             w9, w10, w9  // bit 0 set only if all four groups are flagged "not filtered"
        bic             v2.16b, v5.16b, v19.16b  // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        mls             v4.8h, v0.8h, v1.8h  // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
        tbnz            w9, #0, 4f  // none of the 16 pixel pairs should be updated in this case
        mls             v27.8h, v2.8h, v6.8h  // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
        mla             v16.8h, v2.8h, v6.8h  // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
        sqxtun          v2.8b, v4.8h  // P4[8..15] narrowed to unsigned bytes with saturation
        mla             v18.8h, v0.8h, v1.8h  // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v27.8h  // P4[0..7]
        sqxtun          v1.8b, v16.8h  // P5[0..7]
        sqxtun          v3.8b, v18.8h  // P5[8..15]
        tbnz            w2, #0, 1f  // skip rows 0..3 if pixel pair 2 is not filtered
        st2             {v0.b, v1.b}[0], [x0], x1  // each st2 writes one P4,P5 byte pair
        st2             {v0.b, v1.b}[1], [x0], x1
        st2             {v0.b, v1.b}[2], [x0], x1
        st2             {v0.b, v1.b}[3], [x0]
1:      tbnz            w3, #0, 2f  // skip rows 4..7 if pixel pair 6 is not filtered
        st2             {v0.b, v1.b}[4], [x5], x1
        st2             {v0.b, v1.b}[5], [x5], x1
        st2             {v0.b, v1.b}[6], [x5], x1
        st2             {v0.b, v1.b}[7], [x5]
2:      tbnz            w7, #0, 3f  // skip rows 8..11 if pixel pair 10 is not filtered
        st2             {v2.b, v3.b}[0], [x4], x1
        st2             {v2.b, v3.b}[1], [x4], x1
        st2             {v2.b, v3.b}[2], [x4], x1
        st2             {v2.b, v3.b}[3], [x4]
3:      tbnz            w8, #0, 4f  // skip rows 12..15 if pixel pair 14 is not filtered
        st2             {v2.b, v3.b}[4], [x6], x1
        st2             {v2.b, v3.b}[5], [x6], x1
        st2             {v2.b, v3.b}[6], [x6], x1
        st2             {v2.b, v3.b}[7], [x6]
4:      ret
endfunc

// Copy at most the specified number of bytes from source to destination buffer,
// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
// On entry:
//   x0 -> source buffer
//   w1 = max number of bytes to copy
//   x2 -> destination buffer, optimally 8-byte aligned
// On exit:
//   w0 = number of bytes not copied
//
// Structure (derived from the code below):
//   The loop body is two half-iterations of 32 bytes each. The first half
//   (label 2) decides on and stores the block held in v3/v4, the second half
//   (label 3) the block held in v0/v1. While one block is being decided, the
//   next 32 bytes are loaded and their match masks are started, so the match
//   masks v24-v31 always belong to the block that the *next* half-iteration
//   will store. A third vector (v5 or v2) supplies the look-ahead bytes that
//   ext needs, which is why the source is read ahead of what is stored.
//   For every byte offset, the 4 bytes starting there are tested as one
//   little-endian 32-bit word: after clearing the low two bits of the top byte
//   (bic with v20) the word must equal 0x00030000 (eor with v21, then
//   cmeq #0), i.e. the bytes in memory are 00 00 03 followed by a byte in the
//   range 0..3.
// Clobbers (as written below): x0-x3, v0-v5, v20-v31, condition flags.
function ff_vc1_unescape_buffer_helper_neon, export=1
        // Offset by 80 to screen out cases that are too short for us to handle,
        // and also make it easy to test for loop termination, or to determine
        // whether we need an odd number of half-iterations of the loop.
        subs            w1, w1, #80
        b.mi            90f

        // Set up useful constants
        movi            v20.4s, #3, lsl #24  // 0x03000000 in each word: bits to ignore in the 4th byte
        movi            v21.4s, #3, lsl #16  // 0x00030000 in each word: the pattern to match

        tst             w1, #32
        b.ne            1f

        // Bit 5 of the offset count is clear: prime the pipeline with v0-v2 and
        // enter the loop at its second half.
        ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
        ext             v25.16b, v0.16b, v1.16b, #1  // views of the same data starting 1, 2 and 3 bytes later
        ext             v26.16b, v0.16b, v1.16b, #2
        ext             v27.16b, v0.16b, v1.16b, #3
        ext             v29.16b, v1.16b, v2.16b, #1
        ext             v30.16b, v1.16b, v2.16b, #2
        ext             v31.16b, v1.16b, v2.16b, #3
        bic             v24.16b, v0.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v1.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        add             w1, w1, #32  // compensates for entering mid-loop, ahead of the subs #64 at the loop end
        b               3f

        // Bit 5 of the offset count is set: prime the pipeline with v3-v5 and
        // fall into the first half of the loop.
1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
        ext             v25.16b, v3.16b, v4.16b, #1
        ext             v26.16b, v3.16b, v4.16b, #2
        ext             v27.16b, v3.16b, v4.16b, #3
        ext             v29.16b, v4.16b, v5.16b, #1
        ext             v30.16b, v4.16b, v5.16b, #2
        ext             v31.16b, v4.16b, v5.16b, #3
        bic             v24.16b, v3.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v4.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        // Drop through...
        // First half-iteration: finish the decision for v3/v4 while starting
        // the tests for the next block (v0/v1, with v2 as look-ahead).
2:      mov             v0.16b, v5.16b  // previous look-ahead vector becomes the start of the next block
        ld1             {v1.16b, v2.16b}, [x0], #32
        cmeq            v28.4s, v28.4s, #0
        cmeq            v29.4s, v29.4s, #0
        cmeq            v30.4s, v30.4s, #0
        cmeq            v31.4s, v31.4s, #0
        orr             v24.16b, v24.16b, v25.16b  // combine the eight match masks for v3/v4...
        orr             v26.16b, v26.16b, v27.16b
        orr             v28.16b, v28.16b, v29.16b
        orr             v30.16b, v30.16b, v31.16b
        ext             v25.16b, v0.16b, v1.16b, #1
        orr             v22.16b, v24.16b, v26.16b
        ext             v26.16b, v0.16b, v1.16b, #2
        ext             v27.16b, v0.16b, v1.16b, #3
        ext             v29.16b, v1.16b, v2.16b, #1
        orr             v23.16b, v28.16b, v30.16b
        ext             v30.16b, v1.16b, v2.16b, #2
        ext             v31.16b, v1.16b, v2.16b, #3
        bic             v24.16b, v0.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        orr             v22.16b, v22.16b, v23.16b  // ...into a single vector
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v1.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        addv            s22, v22.4s  // lanes are 0 or all-ones, so the sum is non-zero iff any lane matched
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        mov             w3, v22.s[0]
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        cbnz            w3, 90f  // a match starts within v3/v4: stop without storing them
        st1             {v3.16b, v4.16b}, [x2], #32
        // Second half-iteration: same as above with the roles of v0-v2 and
        // v3-v5 exchanged.
3:      mov             v3.16b, v2.16b
        ld1             {v4.16b, v5.16b}, [x0], #32
        cmeq            v28.4s, v28.4s, #0
        cmeq            v29.4s, v29.4s, #0
        cmeq            v30.4s, v30.4s, #0
        cmeq            v31.4s, v31.4s, #0
        orr             v24.16b, v24.16b, v25.16b
        orr             v26.16b, v26.16b, v27.16b
        orr             v28.16b, v28.16b, v29.16b
        orr             v30.16b, v30.16b, v31.16b
        ext             v25.16b, v3.16b, v4.16b, #1
        orr             v22.16b, v24.16b, v26.16b
        ext             v26.16b, v3.16b, v4.16b, #2
        ext             v27.16b, v3.16b, v4.16b, #3
        ext             v29.16b, v4.16b, v5.16b, #1
        orr             v23.16b, v28.16b, v30.16b
        ext             v30.16b, v4.16b, v5.16b, #2
        ext             v31.16b, v4.16b, v5.16b, #3
        bic             v24.16b, v3.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        orr             v22.16b, v22.16b, v23.16b
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v4.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        addv            s22, v22.4s
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        mov             w3, v22.s[0]
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        cbnz            w3, 91f  // a match starts within v0/v1: stop without storing them
        st1             {v0.16b, v1.16b}, [x2], #32
        subs            w1, w1, #64  // two half-iterations of 32 bytes each
        b.pl            2b

90:     add             w0, w1, #80  // undo the initial offset to give the number of bytes not copied
        ret

        // Exit from the second half-iteration: the first half already stored
        // 32 bytes that the subs at the loop end has not yet accounted for.
91:     sub             w1, w1, #32
        b               90b
endfunc