/*
 * IDCT AArch64 NEON optimisations
 *
 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Clamp 16-bit signed block coefficients to unsigned 8-bit
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// Each group of 8 coefficients becomes one 8-byte output row; 8 rows are
// written, x2 bytes apart.
// Clobbers: x0, x1, v0-v7 (no stack or callee-saved registers are used)
function ff_put_pixels_clamped_neon, export=1
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // coefficient rows 0-3
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]      // coefficient rows 4-7
        sqxtun          v0.8b,  v0.8h           // saturate signed 16-bit -> unsigned 8-bit
        sqxtun          v1.8b,  v1.8h
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        sqxtun          v4.8b,  v4.8h
        st1             {v0.8b}, [x1], x2       // row 0
        sqxtun          v0.8b,  v5.8h           // row 5, reusing v0 now that row 0 is stored
        st1             {v1.8b}, [x1], x2       // row 1
        sqxtun          v1.8b,  v6.8h           // row 6
        st1             {v2.8b}, [x1], x2       // row 2
        sqxtun          v2.8b,  v7.8h           // row 7
        st1             {v3.8b}, [x1], x2       // row 3
        st1             {v4.8b}, [x1], x2       // row 4
        st1             {v0.8b}, [x1], x2       // row 5
        st1             {v1.8b}, [x1], x2       // row 6
        st1             {v2.8b}, [x1]           // row 7 (no pointer update needed)
        ret
endfunc

// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// Each coefficient is saturated to the signed 8-bit range and then has 128
// added with 8-bit wrap-around, giving an unsigned 0..255 result.
// Clobbers: x0, x1, v0-v7, v16-v19 (no stack or callee-saved registers are used)
function ff_put_signed_pixels_clamped_neon, export=1
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64     // coefficient rows 0-3
        movi            v4.8b,  #128            // bias applied to every output byte
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]      // coefficient rows 4-7
        sqxtn           v0.8b,  v0.8h           // saturate signed 16-bit -> signed 8-bit
        sqxtn           v1.8b,  v1.8h
        sqxtn           v2.8b,  v2.8h
        sqxtn           v3.8b,  v3.8h
        sqxtn           v5.8b,  v16.8h          // row 4
        add             v0.8b,  v0.8b,  v4.8b   // row 0 + 128
        sqxtn           v6.8b,  v17.8h          // row 5
        add             v1.8b,  v1.8b,  v4.8b   // row 1 + 128
        sqxtn           v7.8b,  v18.8h          // row 6
        add             v2.8b,  v2.8b,  v4.8b   // row 2 + 128
        sqxtn           v16.8b, v19.8h          // row 7
        add             v3.8b,  v3.8b,  v4.8b   // row 3 + 128
        st1             {v0.8b}, [x1], x2       // row 0
        add             v0.8b,  v5.8b,  v4.8b   // row 4 + 128, reusing v0
        st1             {v1.8b}, [x1], x2       // row 1
        add             v1.8b,  v6.8b,  v4.8b   // row 5 + 128
        st1             {v2.8b}, [x1], x2       // row 2
        add             v2.8b,  v7.8b,  v4.8b   // row 6 + 128
        st1             {v3.8b}, [x1], x2       // row 3
        add             v3.8b,  v16.8b, v4.8b   // row 7 + 128
        st1             {v0.8b}, [x1], x2       // row 4
        st1             {v1.8b}, [x1], x2       // row 5
        st1             {v2.8b}, [x1], x2       // row 6
        st1             {v3.8b}, [x1]           // row 7 (no pointer update needed)
        ret
endfunc

// Add 16-bit signed block coefficients to unsigned 8-bit
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit input and results
//   x2 = row stride for 8-bit input and results, bytes
// Each existing 8-bit value is zero-extended, added to the matching 16-bit
// coefficient, and the sum is saturated back to unsigned 8-bit in place.
// x1 is used as the read pointer and x3 (a copy of the original x1) as the
// write pointer.
// Clobbers: x0, x1, x3, v0-v7, v16-v19 (no stack or callee-saved registers are used)
function ff_add_pixels_clamped_neon, export=1
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64     // coefficient rows 0-3
        mov             x3,  x1                 // x3 = write pointer, x1 = read pointer
        ld1             {v4.8b}, [x1], x2       // existing row 0
        ld1             {v5.8b}, [x1], x2       // existing row 1
        ld1             {v6.8b}, [x1], x2       // existing row 2
        ld1             {v7.8b}, [x1], x2       // existing row 3
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]      // coefficient rows 4-7
        uaddw           v0.8h,  v0.8h,  v4.8b   // coefficient + zero-extended existing value
        uaddw           v1.8h,  v1.8h,  v5.8b
        uaddw           v2.8h,  v2.8h,  v6.8b
        ld1             {v4.8b}, [x1], x2       // existing row 4
        uaddw           v3.8h,  v3.8h,  v7.8b
        ld1             {v5.8b}, [x1], x2       // existing row 5
        sqxtun          v0.8b,  v0.8h           // saturate signed 16-bit -> unsigned 8-bit
        ld1             {v6.8b}, [x1], x2       // existing row 6
        sqxtun          v1.8b,  v1.8h
        ld1             {v7.8b}, [x1]           // existing row 7
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        uaddw           v4.8h,  v16.8h, v4.8b   // row 4 sum
        st1             {v0.8b}, [x3], x2       // row 0
        uaddw           v0.8h,  v17.8h, v5.8b   // row 5 sum, reusing v0
        st1             {v1.8b}, [x3], x2       // row 1
        uaddw           v1.8h,  v18.8h, v6.8b   // row 6 sum
        st1             {v2.8b}, [x3], x2       // row 2
        uaddw           v2.8h,  v19.8h, v7.8b   // row 7 sum
        sqxtun          v4.8b,  v4.8h
        sqxtun          v0.8b,  v0.8h
        st1             {v3.8b}, [x3], x2       // row 3
        sqxtun          v1.8b,  v1.8h
        sqxtun          v2.8b,  v2.8h
        st1             {v4.8b}, [x3], x2       // row 4
        st1             {v0.8b}, [x3], x2       // row 5
        st1             {v1.8b}, [x3], x2       // row 6
        st1             {v2.8b}, [x3]           // row 7 (no pointer update needed)
        ret
endfunc