/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// h264_loop_filter_start
//
// Common entry check for the non-intra loop filters.
//   In:    w2 = alpha, w3 = beta, x4 = pointer to four bytes that are loaded
//          as one word (presumably the tc0 table - confirm against the C
//          prototype).
//   Out:   v24.S[0] = w6 = the word loaded from [x4]; execution falls through
//          to local label 2 when filtering should proceed.
//   Exits: executes `ret` (returns from the enclosing function) when
//            - w2 != 0 and w3 == 0, or
//            - all four loaded bytes have their top bit set (the two shifted
//              ANDs fold the four sign bits into bit 31 of w8).
//   Clobb: w6, w8, v24, flags.
//
// NOTE(review): when w2 == 0 the ccmp condition fails and NZCV is written
// with #0, which leaves Z clear, so b.eq is not taken and alpha == 0 is not
// short-circuited here. The filter macros still produce no change in that
// case because their "< alpha" mask comes out empty.
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        ccmp            w3,  #0,  #0,  ne
        mov             v24.S[0], w6
        and             w8,  w6,  w6,  lsl #16
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8
        b.ge            2f
1:
        ret
2:
.endm

// h264_loop_filter_luma
//
// Filters 16 luma positions across one edge.
//   In:    v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//          w2 = alpha, w3 = beta
//          v24.S[0] = four bytes set up by h264_loop_filter_start; each byte
//          is replicated over four consecutive lanes by the uxtl/sli pairs.
//   Out:   v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1
//   Exits: branches to local label 9 (to be provided by the including
//          function) when no lane passes the alpha/beta tests.
//   Clobb: x7, v4, v17, v19-v24, v28, v30.
.macro  h264_loop_filter_luma
        dup             v22.16B, w2                     // alpha
        uxtl            v24.8H,  v24.8B
        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
        uxtl            v24.4S,  v24.4H
        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
        sli             v24.8H,  v24.8H,  #8
        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
        sli             v24.4S,  v24.4S,  #16
        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
        dup             v22.16B, w3                     // beta
        cmlt            v23.16B, v24.16B, #0            // lanes whose v24 byte is negative
        cmhi            v28.16B, v22.16B, v28.16B       // < beta
        cmhi            v30.16B, v22.16B, v30.16B       // < beta
        bic             v21.16B, v21.16B, v23.16B       // drop the negative lanes from the mask
        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
        and             v21.16B, v21.16B, v28.16B
        uabd            v19.16B, v4.16B,  v0.16B        // abs(q2 - q0)
        and             v21.16B, v21.16B, v30.16B       // < beta
        shrn            v30.8b,  v21.8h,  #4            // pack the 16 byte mask into 64 bits
        mov             x7,  v30.d[0]
        cmhi            v17.16B, v22.16B, v17.16B       // < beta
        cmhi            v19.16B, v22.16B, v19.16B       // < beta
        cbz             x7,  9f                         // nothing to filter
        and             v17.16B, v17.16B, v21.16B
        and             v19.16B, v19.16B, v21.16B
        and             v24.16B, v24.16B, v21.16B
        urhadd          v28.16B, v16.16B, v0.16B
        sub             v21.16B, v24.16B, v17.16B       // v17/v19 lanes are 0 or -1, so each
        uqadd           v23.16B, v18.16B, v24.16B       // sub adds 1 to the clip bound
        uhadd           v20.16B, v20.16B, v28.16B
        sub             v21.16B, v21.16B, v19.16B
        uhadd           v28.16B, v4.16B,  v28.16B
        umin            v23.16B, v23.16B, v20.16B
        uqsub           v22.16B, v18.16B, v24.16B
        uqadd           v4.16B,  v2.16B,  v24.16B
        umax            v23.16B, v23.16B, v22.16B
        uqsub           v22.16B, v2.16B,  v24.16B
        umin            v28.16B, v4.16B,  v28.16B
        uxtl            v4.8H,   v0.8B
        umax            v28.16B, v28.16B, v22.16B
        uxtl2           v20.8H,  v0.16B
        usubw           v4.8H,   v4.8H,   v16.8B
        usubw2          v20.8H,  v20.8H,  v16.16B
        shl             v4.8H,   v4.8H,   #2
        shl             v20.8H,  v20.8H,  #2
        uaddw           v4.8H,   v4.8H,   v18.8B
        uaddw2          v20.8H,  v20.8H,  v18.16B
        usubw           v4.8H,   v4.8H,   v2.8B
        usubw2          v20.8H,  v20.8H,  v2.16B
        rshrn           v4.8B,   v4.8H,   #3            // (4*(q0 - p0) + p1 - q1 + 4) >> 3
        rshrn2          v4.16B,  v20.8H,  #3
        bsl             v17.16B, v23.16B, v18.16B       // select filtered or original p1
        bsl             v19.16B, v28.16B, v2.16B        // select filtered or original q1
        neg             v23.16B, v21.16B
        uxtl            v28.8H,  v16.8B
        smin            v4.16B,  v4.16B,  v21.16B       // clamp the delta to [-v21, v21]
        uxtl2           v21.8H,  v16.16B
        smax            v4.16B,  v4.16B,  v23.16B
        uxtl            v22.8H,  v0.8B
        uxtl2           v24.8H,  v0.16B
        saddw           v28.8H,  v28.8H,  v4.8B         // p0 + delta
        saddw2          v21.8H,  v21.8H,  v4.16B
        ssubw           v22.8H,  v22.8H,  v4.8B         // q0 - delta
        ssubw2          v24.8H,  v24.8H,  v4.16B
        sqxtun          v16.8B,  v28.8H                 // saturate back to unsigned 8 bit
        sqxtun2         v16.16B, v21.8H
        sqxtun          v0.8B,   v22.8H
        sqxtun2         v0.16B,  v24.8H
.endm

// ff_h264_v_loop_filter_luma_neon
//
// Filters a 16 byte wide edge lying between two rows.
//   In:    x0 = address of the q0 row, x1 = byte distance between rows,
//          w2 = alpha, w3 = beta, x4 = see h264_loop_filter_start.
//   Reads the rows x0 - 3*x1 .. x0 + 2*x1 (p2, p1, p0, q0, q1, q2) and writes
//   the rows x0 - 2*x1 .. x0 + x1 (p1, p0, q0, q1).
//   Clobb: x0, w6, x7, w8, flags and the vector registers listed for the two
//          macros above.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        ld1             {v0.16B},  [x0], x1             // q0
        ld1             {v2.16B},  [x0], x1             // q1
        ld1             {v4.16B},  [x0], x1             // q2
        sub             x0,  x0,  x1,  lsl #2           // back 4 + 2 rows: from the row
        sub             x0,  x0,  x1,  lsl #1           // after q2 to the p2 row
        ld1             {v20.16B}, [x0], x1             // p2
        ld1             {v18.16B}, [x0], x1             // p1
        ld1             {v16.16B}, [x0], x1             // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1,  lsl #1           // x0 is at q0 again; step to p1
        st1             {v17.16B}, [x0], x1
        st1             {v16.16B}, [x0], x1
        st1             {v0.16B},  [x0], x1
        st1             {v19.16B}, [x0]
9:
        ret
endfunc

// ff_h264_h_loop_filter_luma_neon
//
// Filters an edge lying between two columns, 16 rows tall.
//   In:    x0 = address of q0 in the first row, x1 = byte distance between
//          rows, w2 = alpha, w3 = beta, x4 = see h264_loop_filter_start.
//   Loads 8 bytes starting at x0 - 4 from each of 16 rows, transposes them so
//   that every register holds one column, filters, transposes the four
//   changed columns back and stores 4 bytes at x0 - 2 in each row.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #4
        ld1             {v6.8B},   [x0], x1
        ld1             {v20.8B},  [x0], x1
        ld1             {v18.8B},  [x0], x1
        ld1             {v16.8B},  [x0], x1
        ld1             {v0.8B},   [x0], x1
        ld1             {v2.8B},   [x0], x1
        ld1             {v4.8B},   [x0], x1
        ld1             {v26.8B},  [x0], x1
        ld1             {v6.D}[1],  [x0], x1
        ld1             {v20.D}[1], [x0], x1
        ld1             {v18.D}[1], [x0], x1
        ld1             {v16.D}[1], [x0], x1
        ld1             {v0.D}[1],  [x0], x1
        ld1             {v2.D}[1],  [x0], x1
        ld1             {v4.D}[1],  [x0], x1
        ld1             {v26.D}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1,  lsl #4           // back to the first row
        add             x0,  x0,  #2                    // column of p1
        st1             {v17.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v19.S}[0], [x0], x1
        st1             {v17.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v19.S}[1], [x0], x1
        st1             {v17.S}[2], [x0], x1
        st1             {v16.S}[2], [x0], x1
        st1             {v0.S}[2],  [x0], x1
        st1             {v19.S}[2], [x0], x1
        st1             {v17.S}[3], [x0], x1
        st1             {v16.S}[3], [x0], x1
        st1             {v0.S}[3],  [x0], x1
        st1             {v19.S}[3], [x0], x1
9:
        ret
endfunc


// h264_loop_filter_start_intra
//
// Entry check for the intra loop filters.
//   In:    w2 = alpha, w3 = beta.
//   Out:   v30 = alpha and v31 = beta, replicated to all 16 bytes.
//   Exits: executes `ret` when alpha and beta are both zero.
//   Clobb: w4, v30, v31.
.macro  h264_loop_filter_start_intra
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        dup             v30.16b, w2                     // alpha
        dup             v31.16b, w3                     // beta
.endm

// h264_loop_filter_luma_intra
//
// Intra luma filter for 16 positions across one edge.
//   In:    v4 = p3, v5 = p2, v6 = p1, v7 = p0,
//          v0 = q0, v1 = q1, v2 = q2, v3 = q3,
//          v30 = alpha, v31 = beta (see h264_loop_filter_start_intra).
//   Out:   v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) updated in
//          place in the lanes that pass the tests; v4 and v3 are only read.
//   Exits: branches to local label 9 when no lane passes the basic
//          alpha/beta test (if_1).
//   Clobb: x4, v16-v31.
.macro  h264_loop_filter_luma_intra
        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b       // if_1
        shrn            v20.8b,  v19.8h,  #4            // pack the 16 byte mask into 64 bits
        mov             x4,  v20.d[0]
        cbz             x4,  9f                         // nothing to filter

        ushll           v20.8h,  v6.8b,   #1
        ushll           v22.8h,  v1.8b,   #1
        ushll2          v21.8h,  v6.16b,  #1
        ushll2          v23.8h,  v1.16b,  #1
        uaddw           v20.8h,  v20.8h,  v7.8b
        uaddw           v22.8h,  v22.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v7.16b
        uaddw2          v23.8h,  v23.8h,  v0.16b
        uaddw           v20.8h,  v20.8h,  v1.8b         // v20/v21 = 2*p1 + p0 + q1
        uaddw           v22.8h,  v22.8h,  v6.8b         // v22/v23 = 2*q1 + q0 + p1
        uaddw2          v21.8h,  v21.8h,  v1.16b
        uaddw2          v23.8h,  v23.8h,  v6.16b

        rshrn           v24.8b,  v20.8h,  #2            // p0'_1
        rshrn           v25.8b,  v22.8h,  #2            // q0'_1
        rshrn2          v24.16b, v21.8h,  #2            // p0'_1
        rshrn2          v25.16b, v23.8h,  #2            // q0'_1

        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b       // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3)
        and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4)

        and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3
        and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4

        //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
        uaddl           v26.8h,  v5.8b,   v7.8b
        uaddl2          v27.8h,  v5.16b,  v7.16b
        uaddw           v26.8h,  v26.8h,  v0.8b         // v26/v27 = p2 + p0 + q0
        uaddw2          v27.8h,  v27.8h,  v0.16b
        add             v20.8h,  v20.8h,  v26.8h
        add             v21.8h,  v21.8h,  v27.8h
        uaddw           v20.8h,  v20.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v0.16b
        rshrn           v20.8b,  v20.8h,  #3            // p0'_2
        rshrn2          v20.16b, v21.8h,  #3            // p0'_2
        uaddw           v26.8h,  v26.8h,  v6.8b
        uaddw2          v27.8h,  v27.8h,  v6.16b
        rshrn           v21.8b,  v26.8h,  #2            // p1'_2
        rshrn2          v21.16b, v27.8h,  #2            // p1'_2
        uaddl           v28.8h,  v4.8b,   v5.8b
        uaddl2          v29.8h,  v4.16b,  v5.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v19.8b,  v28.8h,  #3            // p2'_2 (the if_1 mask in v19 is dead here)
        rshrn2          v19.16b, v29.8h,  #3            // p2'_2

        //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
        uaddl           v26.8h,  v2.8b,   v0.8b
        uaddl2          v27.8h,  v2.16b,  v0.16b
        uaddw           v26.8h,  v26.8h,  v7.8b         // v26/v27 = q2 + q0 + p0
        uaddw2          v27.8h,  v27.8h,  v7.16b
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        uaddw           v22.8h,  v22.8h,  v7.8b
        uaddw2          v23.8h,  v23.8h,  v7.16b
        rshrn           v22.8b,  v22.8h,  #3            // q0'_2
        rshrn2          v22.16b, v23.8h,  #3            // q0'_2
        uaddw           v26.8h,  v26.8h,  v1.8b
        uaddw2          v27.8h,  v27.8h,  v1.16b
        rshrn           v23.8b,  v26.8h,  #2            // q1'_2
        rshrn2          v23.16b, v27.8h,  #2            // q1'_2
        uaddl           v28.8h,  v2.8b,   v3.8b
        uaddl2          v29.8h,  v2.16b,  v3.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v26.8b,  v28.8h,  #3            // q2'_2
        rshrn2          v26.16b, v29.8h,  #3            // q2'_2

        bit             v7.16b,  v24.16b, v30.16b       // p0'_1
        bit             v0.16b,  v25.16b, v31.16b       // q0'_1
        bit             v7.16b,  v20.16b, v17.16b       // p0'_2
        bit             v6.16b,  v21.16b, v17.16b       // p1'_2
        bit             v5.16b,  v19.16b, v17.16b       // p2'_2
        bit             v0.16b,  v22.16b, v18.16b       // q0'_2
        bit             v1.16b,  v23.16b, v18.16b       // q1'_2
        bit             v2.16b,  v26.16b, v18.16b       // q2'_2
.endm

// ff_h264_v_loop_filter_luma_intra_neon
//
// Intra filter for a 16 byte wide edge lying between two rows.
//   In:    x0 = address of the q0 row, x1 = byte distance between rows,
//          w2 = alpha, w3 = beta.
//   Reads the rows x0 - 4*x1 .. x0 + 3*x1 and writes the rows
//   x0 - 3*x1 .. x0 + 2*x1 (p2 .. q2).
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v1.16b},  [x0], x1             // q1
        ld1             {v2.16b},  [x0], x1             // q2
        ld1             {v3.16b},  [x0], x1             // q3
        sub             x0,  x0,  x1,  lsl #3
        ld1             {v4.16b},  [x0], x1             // p3
        ld1             {v5.16b},  [x0], x1             // p2
        ld1             {v6.16b},  [x0], x1             // p1
        ld1             {v7.16b},  [x0]                 // p0

        h264_loop_filter_luma_intra

        sub             x0,  x0,  x1,  lsl #1           // from the p0 row to the p2 row
        st1             {v5.16b},  [x0], x1             // p2
        st1             {v6.16b},  [x0], x1             // p1
        st1             {v7.16b},  [x0], x1             // p0
        st1             {v0.16b},  [x0], x1             // q0
        st1             {v1.16b},  [x0], x1             // q1
        st1             {v2.16b},  [x0]                 // q2
9:
        ret
endfunc

// ff_h264_h_loop_filter_luma_intra_neon
//
// Intra filter for an edge lying between two columns, 16 rows tall.
//   In:    x0 = address of q0 in the first row, x1 = byte distance between
//          rows, w2 = alpha, w3 = beta.
//   Loads 8 bytes starting at x0 - 4 from each of 16 rows, transposes,
//   filters, transposes back and stores all 8 bytes of every row again.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  #4
        ld1             {v4.8b},   [x0], x1
        ld1             {v5.8b},   [x0], x1
        ld1             {v6.8b},   [x0], x1
        ld1             {v7.8b},   [x0], x1
        ld1             {v0.8b},   [x0], x1
        ld1             {v1.8b},   [x0], x1
        ld1             {v2.8b},   [x0], x1
        ld1             {v3.8b},   [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1,  lsl #4           // back to the first row
        st1             {v4.8b},   [x0], x1
        st1             {v5.8b},   [x0], x1
        st1             {v6.8b},   [x0], x1
        st1             {v7.8b},   [x0], x1
        st1             {v0.8b},   [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v2.8b},   [x0], x1
        st1             {v3.8b},   [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
9:
        ret
endfunc

// h264_loop_filter_chroma
//
// Filters 8 chroma positions across one edge.
//   In:    v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta,
//          v24.S[0] = four bytes set up by h264_loop_filter_start; each byte
//          is replicated over two consecutive lanes by the uxtl/sli pair.
//   Out:   v16 = new p0, v0 = new q0.
//   Exits: branches to local label 9 when no lane passes the alpha/beta
//          tests.
//   Clobb: x8, v4, v22-v26, v28, v30.
.macro  h264_loop_filter_chroma
        dup             v22.8B,  w2                     // alpha
        dup             v23.8B,  w3                     // beta
        uxtl            v24.8H,  v24.8B
        uabd            v26.8B,  v16.8B,  v0.8B         // abs(p0 - q0)
        uabd            v28.8B,  v18.8B,  v16.8B        // abs(p1 - p0)
        uabd            v30.8B,  v2.8B,   v0.8B         // abs(q1 - q0)
        cmhi            v26.8B,  v22.8B,  v26.8B        // < alpha
        cmhi            v28.8B,  v23.8B,  v28.8B        // < beta
        cmhi            v30.8B,  v23.8B,  v30.8B        // < beta
        uxtl            v4.8H,   v0.8B
        and             v26.8B,  v26.8B,  v28.8B
        usubw           v4.8H,   v4.8H,   v16.8B
        and             v26.8B,  v26.8B,  v30.8B
        shl             v4.8H,   v4.8H,   #2
        mov             x8,  v26.d[0]
        sli             v24.8H,  v24.8H,  #8
        uaddw           v4.8H,   v4.8H,   v18.8B
        cbz             x8,  9f                         // nothing to filter
        usubw           v4.8H,   v4.8H,   v2.8B
        rshrn           v4.8B,   v4.8H,   #3            // (4*(q0 - p0) + p1 - q1 + 4) >> 3
        smin            v4.8B,   v4.8B,   v24.8B        // clamp the delta to [-v24, v24]
        neg             v25.8B,  v24.8B
        smax            v4.8B,   v4.8B,   v25.8B
        uxtl            v22.8H,  v0.8B
        and             v4.8B,   v4.8B,   v26.8B        // zero the delta in unfiltered lanes
        uxtl            v28.8H,  v16.8B
        saddw           v28.8H,  v28.8H,  v4.8B         // p0 + delta
        ssubw           v22.8H,  v22.8H,  v4.8B         // q0 - delta
        sqxtun          v16.8B,  v28.8H
        sqxtun          v0.8B,   v22.8H
.endm

// ff_h264_v_loop_filter_chroma_neon
//
// Filters an 8 byte wide chroma edge lying between two rows.
//   In:    x0 = address of the q0 row, x1 = byte distance between rows,
//          w2 = alpha, w3 = beta, x4 = see h264_loop_filter_start.
//   Reads the rows x0 - 2*x1 .. x0 + x1 and writes the rows x0 - x1 and x0.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  x1,  lsl #1
        ld1             {v18.8B},  [x0], x1             // p1
        ld1             {v16.8B},  [x0], x1             // p0
        ld1             {v0.8B},   [x0], x1             // q0
        ld1             {v2.8B},   [x0]                 // q1

        h264_loop_filter_chroma

        sub             x0,  x0,  x1,  lsl #1           // from the q1 row to the p0 row
        st1             {v16.8B},  [x0], x1
        st1             {v0.8B},   [x0], x1
9:
        ret
endfunc

// ff_h264_h_loop_filter_chroma_neon
//
// Filters a chroma edge lying between two columns, 8 rows tall.
//   In:    x0 = address of q0 in the first row, x1 = byte distance between
//          rows, w2 = alpha, w3 = beta, x4 = see h264_loop_filter_start.
//
// The label h_loop_filter_chroma420 is a second entry point used by
// ff_h264_h_loop_filter_chroma422_neon. It expects x0 to point at the p1
// column of the first row, and w2, w3 and v24.S[0] to be set up already.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #2
h_loop_filter_chroma420:
        ld1             {v18.S}[0], [x0], x1
        ld1             {v16.S}[0], [x0], x1
        ld1             {v0.S}[0],  [x0], x1
        ld1             {v2.S}[0],  [x0], x1
        ld1             {v18.S}[1], [x0], x1
        ld1             {v16.S}[1], [x0], x1
        ld1             {v0.S}[1],  [x0], x1
        ld1             {v2.S}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1,  lsl #3           // back to the first row
        st1             {v18.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v2.S}[0],  [x0], x1
        st1             {v18.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v2.S}[1],  [x0], x1
9:
        ret
endfunc

// ff_h264_h_loop_filter_chroma422_neon
//
// Same arguments as ff_h264_h_loop_filter_chroma_neon, but covers 16 rows by
// running the 8 row body twice with the row distance doubled: the first pass
// starts at the first row, the second pass at the row after it (x5).
//   x7 keeps the return address across the bl; v24.S[0] is reloaded from w6
//   before the second pass because the filter macro overwrites v24.
function ff_h264_h_loop_filter_chroma422_neon, export=1
        h264_loop_filter_start
        add             x5,  x0,  x1                    // start of the second pass
        sub             x0,  x0,  #2
        add             x1,  x1,  x1                    // visit every other row
        mov             x7,  x30
        bl              h_loop_filter_chroma420
        mov             x30, x7
        sub             x0,  x5,  #2
        mov             v24.s[0], w6
        b               h_loop_filter_chroma420         // tail call; its ret returns to our caller
endfunc

// h264_loop_filter_chroma_intra
//
// Intra chroma filter for 8 positions across one edge.
//   In:    v18 = p1, v16 = p0, v17 = q0, v19 = q1,
//          v30 = alpha, v31 = beta (see h264_loop_filter_start_intra).
//   Out:   v16 = new p0, v17 = new q0 in the lanes that pass the tests.
//   Exits: branches to local label 9 when no lane passes.
//   Clobb: x2, v4, v6, v20, v22, v24-v28.
.macro  h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b        // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b        // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b        // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b        // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b        // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b
        mov             x2,  v26.d[0]

        ushll           v4.8h,   v18.8b,  #1            // 2*p1
        ushll           v6.8h,   v19.8b,  #1            // 2*q1
        cbz             x2,  9f                         // nothing to filter
        uaddl           v20.8h,  v16.8b,  v19.8b
        uaddl           v22.8h,  v17.8b,  v18.8b
        add             v20.8h,  v20.8h,  v4.8h         // 2*p1 + p0 + q1
        add             v22.8h,  v22.8h,  v6.8h         // 2*q1 + q0 + p1
        uqrshrn         v24.8b,  v20.8h,  #2
        uqrshrn         v25.8b,  v22.8h,  #2
        bit             v16.8b,  v24.8b,  v26.8b
        bit             v17.8b,  v25.8b,  v26.8b
.endm

// ff_h264_v_loop_filter_chroma_intra_neon
//
// Intra filter for an 8 byte wide chroma edge lying between two rows.
//   In:    x0 = address of the q0 row, x1 = byte distance between rows,
//          w2 = alpha, w3 = beta.
//   Reads the rows x0 - 2*x1 .. x0 + x1 and writes the rows x0 - x1 and x0.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  x1,  lsl #1
        ld1             {v18.8b},  [x0], x1             // p1
        ld1             {v16.8b},  [x0], x1             // p0
        ld1             {v17.8b},  [x0], x1             // q0
        ld1             {v19.8b},  [x0]                 // q1

        h264_loop_filter_chroma_intra

        sub             x0,  x0,  x1,  lsl #1           // from the q1 row to the p0 row
        st1             {v16.8b},  [x0], x1
        st1             {v17.8b},  [x0], x1

9:
        ret
endfunc

// ff_h264_h_loop_filter_chroma_mbaff_intra_neon
//
// Intra filter for a chroma edge lying between two columns, 4 rows tall.
//   In:    x0 = address of q0 in the first row, x1 = byte distance between
//          rows, w2 = alpha, w3 = beta.
//   Loads through x4 (starting at x0 - 2) and stores the p0/q0 byte pair of
//   each row through x0 (starting at x0 - 1).
//   Each row load fetches 8 bytes.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2
        sub             x0,  x0,  #1
        ld1             {v18.8b},  [x4], x1
        ld1             {v16.8b},  [x4], x1
        ld1             {v17.8b},  [x4], x1
        ld1             {v19.8b},  [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc

// ff_h264_h_loop_filter_chroma_intra_neon
//
// Intra filter for a chroma edge lying between two columns, 8 rows tall.
//   In:    x0 = address of q0 in the first row, x1 = byte distance between
//          rows, w2 = alpha, w3 = beta.
//
// The label h_loop_filter_chroma420_intra is a second entry point used by
// ff_h264_h_loop_filter_chroma422_intra_neon. It expects x4 = load address
// (p1 column), x0 = store address (p0 column) and v30/v31 = alpha/beta.
//   The first four row loads fetch 8 bytes each; the upper halves are then
//   replaced by the 4 byte lane loads of rows 4-7.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2
        sub             x0,  x0,  #1
h_loop_filter_chroma420_intra:
        ld1             {v18.8b},   [x4], x1
        ld1             {v16.8b},   [x4], x1
        ld1             {v17.8b},   [x4], x1
        ld1             {v19.8b},   [x4], x1
        ld1             {v18.s}[1], [x4], x1
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc

// ff_h264_h_loop_filter_chroma422_intra_neon
//
// Same arguments as ff_h264_h_loop_filter_chroma_intra_neon, but covers 16
// rows by running the 8 row body twice: rows 0-7, then rows 8-15 (x5).
//   x7 keeps the return address across the bl. x4 is not reset for the
//   second pass: the eight post-incremented loads of the first pass leave it
//   at row 8 already.
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub             x4,  x0,  #2
        add             x5,  x0,  x1,  lsl #3           // q0 of row 8
        sub             x0,  x0,  #1
        mov             x7,  x30
        bl              h_loop_filter_chroma420_intra
        sub             x0,  x5,  #1
        mov             x30, x7
        b               h_loop_filter_chroma420_intra   // tail call; its ret returns to our caller
endfunc

// biweight_16 / biweight_8 / biweight_4
//
// Row loops that combine two inputs with per-input weights, for row widths
// of 16, 8 and 4 bytes. Each macro ends in `ret`.
//   macs, macd = widening multiply-accumulate mnemonics substituted into the
//                body (the "2" suffix is appended for the upper halves);
//                macd is applied to the rows read through x0, macs to the
//                rows read through x1. The code that picks them is outside
//                this chunk - presumably umlal/umlsl variants, TODO confirm.
//   In:    x0, x1 = the two inputs, x2 = byte distance between rows (used for
//          both inputs and the output), w3 = number of rows, w5, w6 = weights
//          (low byte replicated), x7 = output pointer, v16 = initial value of
//          every accumulator lane, v18 = per-lane shift count for sshl.
//   Clobb: x0, x1, w3, x7, flags and the vector registers used below.
.macro  biweight_16     macs, macd
        dup             v0.16B,  w5
        dup             v1.16B,  w6
        mov             v4.16B,  v16.16B
        mov             v6.16B,  v16.16B
1:      subs            w3,  w3,  #2                    // two rows per iteration
        ld1             {v20.16B}, [x0], x2
        \macd           v4.8H,   v0.8B,  v20.8B
        \macd\()2       v6.8H,   v0.16B, v20.16B
        ld1             {v22.16B}, [x1], x2
        \macs           v4.8H,   v1.8B,  v22.8B
        \macs\()2       v6.8H,   v1.16B, v22.16B
        mov             v24.16B, v16.16B
        ld1             {v28.16B}, [x0], x2
        mov             v26.16B, v16.16B
        \macd           v24.8H,  v0.8B,  v28.8B
        \macd\()2       v26.8H,  v0.16B, v28.16B
        ld1             {v30.16B}, [x1], x2
        \macs           v24.8H,  v1.8B,  v30.8B
        \macs\()2       v26.8H,  v1.16B, v30.16B
        sshl            v4.8H,   v4.8H,  v18.8H
        sshl            v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H
        sqxtun2         v4.16B,  v6.8H
        sshl            v24.8H,  v24.8H, v18.8H
        sshl            v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        mov             v6.16B,  v16.16B                // re-seed accumulators for the next pass
        st1             {v4.16B},  [x7], x2
        mov             v4.16B,  v16.16B
        st1             {v24.16B}, [x7], x2
        b.ne            1b                              // flags still come from the subs above
        ret
.endm

.macro  biweight_8      macs, macd
        dup             v0.8B,   w5
        dup             v1.8B,   w6
        mov             v2.16B,  v16.16B
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #2                    // two rows per iteration
        ld1             {v4.8B}, [x0], x2
        \macd           v2.8H,   v0.8B,  v4.8B
        ld1             {v5.8B}, [x1], x2
        \macs           v2.8H,   v1.8B,  v5.8B
        ld1             {v6.8B}, [x0], x2
        \macd           v20.8H,  v0.8B,  v6.8B
        ld1             {v7.8B}, [x1], x2
        \macs           v20.8H,  v1.8B,  v7.8B
        sshl            v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        sshl            v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        mov             v20.16B, v16.16B                // re-seed accumulators for the next pass
        st1             {v2.8B}, [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.8B}, [x7], x2
        b.ne            1b
        ret
.endm

.macro  biweight_4      macs, macd
        dup             v0.8B,   w5
        dup             v1.8B,   w6
        mov             v2.16B,  v16.16B
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #4                    // up to four rows per iteration
        ld1             {v4.S}[0], [x0], x2
        ld1             {v4.S}[1], [x0], x2
        \macd           v2.8H,   v0.8B,  v4.8B
        ld1             {v5.S}[0], [x1], x2
        ld1             {v5.S}[1], [x1], x2
        \macs           v2.8H,   v1.8B,  v5.8B
        b.lt            2f                              // fewer than four rows were left
        ld1             {v6.S}[0], [x0], x2
        ld1             {v6.S}[1], [x0], x2
        \macd           v20.8H,  v0.8B,  v6.8B
        ld1             {v7.S}[0], [x1], x2
        ld1             {v7.S}[1], [x1], x2
        \macs           v20.8H,  v1.8B,  v7.8B
        sshl            v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        sshl            v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        mov             v20.16B, v16.16B                // re-seed accumulators for the next pass
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.S}[0], [x7], x2
        st1             {v4.S}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8H,   v2.8H,  v18.8H         // tail: finish the two rows already accumulated
        sqxtun          v2.8B,   v2.8H
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        ret
.endm

// NOTE(review): the definition below runs past the end of this chunk. The
// visible instructions are reproduced unchanged; the final instruction's
// operands continue on the following source line.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8H,  w4
        lsl             w7,  w7,  w4
        not             v18.16B, v18.16B
        dup             v16.8H,  w7
        mov
x7, x0 697cabdff1aSopenharmony_ci cbz w8, 10f 698cabdff1aSopenharmony_ci subs w8, w8, #1 699cabdff1aSopenharmony_ci b.eq 20f 700cabdff1aSopenharmony_ci subs w8, w8, #1 701cabdff1aSopenharmony_ci b.eq 30f 702cabdff1aSopenharmony_ci b 40f 703cabdff1aSopenharmony_ci10: biweight_\w umlal, umlal 704cabdff1aSopenharmony_ci20: neg w5, w5 705cabdff1aSopenharmony_ci biweight_\w umlal, umlsl 706cabdff1aSopenharmony_ci30: neg w5, w5 707cabdff1aSopenharmony_ci neg w6, w6 708cabdff1aSopenharmony_ci biweight_\w umlsl, umlsl 709cabdff1aSopenharmony_ci40: neg w6, w6 710cabdff1aSopenharmony_ci biweight_\w umlsl, umlal 711cabdff1aSopenharmony_ciendfunc 712cabdff1aSopenharmony_ci.endm 713cabdff1aSopenharmony_ci 714cabdff1aSopenharmony_ci biweight_func 16 715cabdff1aSopenharmony_ci biweight_func 8 716cabdff1aSopenharmony_ci biweight_func 4 717cabdff1aSopenharmony_ci 718cabdff1aSopenharmony_ci.macro weight_16 add 719cabdff1aSopenharmony_ci dup v0.16B, w4 720cabdff1aSopenharmony_ci1: subs w2, w2, #2 721cabdff1aSopenharmony_ci ld1 {v20.16B}, [x0], x1 722cabdff1aSopenharmony_ci umull v4.8H, v0.8B, v20.8B 723cabdff1aSopenharmony_ci umull2 v6.8H, v0.16B, v20.16B 724cabdff1aSopenharmony_ci ld1 {v28.16B}, [x0], x1 725cabdff1aSopenharmony_ci umull v24.8H, v0.8B, v28.8B 726cabdff1aSopenharmony_ci umull2 v26.8H, v0.16B, v28.16B 727cabdff1aSopenharmony_ci \add v4.8H, v16.8H, v4.8H 728cabdff1aSopenharmony_ci srshl v4.8H, v4.8H, v18.8H 729cabdff1aSopenharmony_ci \add v6.8H, v16.8H, v6.8H 730cabdff1aSopenharmony_ci srshl v6.8H, v6.8H, v18.8H 731cabdff1aSopenharmony_ci sqxtun v4.8B, v4.8H 732cabdff1aSopenharmony_ci sqxtun2 v4.16B, v6.8H 733cabdff1aSopenharmony_ci \add v24.8H, v16.8H, v24.8H 734cabdff1aSopenharmony_ci srshl v24.8H, v24.8H, v18.8H 735cabdff1aSopenharmony_ci \add v26.8H, v16.8H, v26.8H 736cabdff1aSopenharmony_ci srshl v26.8H, v26.8H, v18.8H 737cabdff1aSopenharmony_ci sqxtun v24.8B, v24.8H 738cabdff1aSopenharmony_ci sqxtun2 v24.16B, v26.8H 739cabdff1aSopenharmony_ci st1 {v4.16B}, [x5], 
x1 740cabdff1aSopenharmony_ci st1 {v24.16B}, [x5], x1 741cabdff1aSopenharmony_ci b.ne 1b 742cabdff1aSopenharmony_ci ret 743cabdff1aSopenharmony_ci.endm 744cabdff1aSopenharmony_ci 745cabdff1aSopenharmony_ci.macro weight_8 add 746cabdff1aSopenharmony_ci dup v0.8B, w4 747cabdff1aSopenharmony_ci1: subs w2, w2, #2 748cabdff1aSopenharmony_ci ld1 {v4.8B}, [x0], x1 749cabdff1aSopenharmony_ci umull v2.8H, v0.8B, v4.8B 750cabdff1aSopenharmony_ci ld1 {v6.8B}, [x0], x1 751cabdff1aSopenharmony_ci umull v20.8H, v0.8B, v6.8B 752cabdff1aSopenharmony_ci \add v2.8H, v16.8H, v2.8H 753cabdff1aSopenharmony_ci srshl v2.8H, v2.8H, v18.8H 754cabdff1aSopenharmony_ci sqxtun v2.8B, v2.8H 755cabdff1aSopenharmony_ci \add v20.8H, v16.8H, v20.8H 756cabdff1aSopenharmony_ci srshl v20.8H, v20.8H, v18.8H 757cabdff1aSopenharmony_ci sqxtun v4.8B, v20.8H 758cabdff1aSopenharmony_ci st1 {v2.8B}, [x5], x1 759cabdff1aSopenharmony_ci st1 {v4.8B}, [x5], x1 760cabdff1aSopenharmony_ci b.ne 1b 761cabdff1aSopenharmony_ci ret 762cabdff1aSopenharmony_ci.endm 763cabdff1aSopenharmony_ci 764cabdff1aSopenharmony_ci.macro weight_4 add 765cabdff1aSopenharmony_ci dup v0.8B, w4 766cabdff1aSopenharmony_ci1: subs w2, w2, #4 767cabdff1aSopenharmony_ci ld1 {v4.S}[0], [x0], x1 768cabdff1aSopenharmony_ci ld1 {v4.S}[1], [x0], x1 769cabdff1aSopenharmony_ci umull v2.8H, v0.8B, v4.8B 770cabdff1aSopenharmony_ci b.lt 2f 771cabdff1aSopenharmony_ci ld1 {v6.S}[0], [x0], x1 772cabdff1aSopenharmony_ci ld1 {v6.S}[1], [x0], x1 773cabdff1aSopenharmony_ci umull v20.8H, v0.8B, v6.8B 774cabdff1aSopenharmony_ci \add v2.8H, v16.8H, v2.8H 775cabdff1aSopenharmony_ci srshl v2.8H, v2.8H, v18.8H 776cabdff1aSopenharmony_ci sqxtun v2.8B, v2.8H 777cabdff1aSopenharmony_ci \add v20.8H, v16.8H, v20.8H 778cabdff1aSopenharmony_ci srshl v20.8H, v20.8h, v18.8H 779cabdff1aSopenharmony_ci sqxtun v4.8B, v20.8H 780cabdff1aSopenharmony_ci st1 {v2.S}[0], [x5], x1 781cabdff1aSopenharmony_ci st1 {v2.S}[1], [x5], x1 782cabdff1aSopenharmony_ci st1 {v4.S}[0], [x5], x1 
783cabdff1aSopenharmony_ci st1 {v4.S}[1], [x5], x1 784cabdff1aSopenharmony_ci b.ne 1b 785cabdff1aSopenharmony_ci ret 786cabdff1aSopenharmony_ci2: \add v2.8H, v16.8H, v2.8H 787cabdff1aSopenharmony_ci srshl v2.8H, v2.8H, v18.8H 788cabdff1aSopenharmony_ci sqxtun v2.8B, v2.8H 789cabdff1aSopenharmony_ci st1 {v2.S}[0], [x5], x1 790cabdff1aSopenharmony_ci st1 {v2.S}[1], [x5], x1 791cabdff1aSopenharmony_ci ret 792cabdff1aSopenharmony_ci.endm 793cabdff1aSopenharmony_ci 794cabdff1aSopenharmony_ci.macro weight_func w 795cabdff1aSopenharmony_cifunction ff_weight_h264_pixels_\w\()_neon, export=1 796cabdff1aSopenharmony_ci cmp w3, #1 797cabdff1aSopenharmony_ci mov w6, #1 798cabdff1aSopenharmony_ci lsl w5, w5, w3 799cabdff1aSopenharmony_ci dup v16.8H, w5 800cabdff1aSopenharmony_ci mov x5, x0 801cabdff1aSopenharmony_ci b.le 20f 802cabdff1aSopenharmony_ci sub w6, w6, w3 803cabdff1aSopenharmony_ci dup v18.8H, w6 804cabdff1aSopenharmony_ci cmp w4, #0 805cabdff1aSopenharmony_ci b.lt 10f 806cabdff1aSopenharmony_ci weight_\w shadd 807cabdff1aSopenharmony_ci10: neg w4, w4 808cabdff1aSopenharmony_ci weight_\w shsub 809cabdff1aSopenharmony_ci20: neg w6, w3 810cabdff1aSopenharmony_ci dup v18.8H, w6 811cabdff1aSopenharmony_ci cmp w4, #0 812cabdff1aSopenharmony_ci b.lt 10f 813cabdff1aSopenharmony_ci weight_\w add 814cabdff1aSopenharmony_ci10: neg w4, w4 815cabdff1aSopenharmony_ci weight_\w sub 816cabdff1aSopenharmony_ciendfunc 817cabdff1aSopenharmony_ci.endm 818cabdff1aSopenharmony_ci 819cabdff1aSopenharmony_ci weight_func 16 820cabdff1aSopenharmony_ci weight_func 8 821cabdff1aSopenharmony_ci weight_func 4 822cabdff1aSopenharmony_ci 823cabdff1aSopenharmony_ci.macro h264_loop_filter_start_10 824cabdff1aSopenharmony_ci cmp w2, #0 825cabdff1aSopenharmony_ci ldr w6, [x4] 826cabdff1aSopenharmony_ci ccmp w3, #0, #0, ne 827cabdff1aSopenharmony_ci lsl w2, w2, #2 828cabdff1aSopenharmony_ci mov v24.S[0], w6 829cabdff1aSopenharmony_ci lsl w3, w3, #2 830cabdff1aSopenharmony_ci and w8, w6, w6, lsl #16 
831cabdff1aSopenharmony_ci b.eq 1f 832cabdff1aSopenharmony_ci ands w8, w8, w8, lsl #8 833cabdff1aSopenharmony_ci b.ge 2f 834cabdff1aSopenharmony_ci1: 835cabdff1aSopenharmony_ci ret 836cabdff1aSopenharmony_ci2: 837cabdff1aSopenharmony_ci.endm 838cabdff1aSopenharmony_ci 839cabdff1aSopenharmony_ci.macro h264_loop_filter_start_intra_10 840cabdff1aSopenharmony_ci orr w4, w2, w3 841cabdff1aSopenharmony_ci cbnz w4, 1f 842cabdff1aSopenharmony_ci ret 843cabdff1aSopenharmony_ci1: 844cabdff1aSopenharmony_ci lsl w2, w2, #2 845cabdff1aSopenharmony_ci lsl w3, w3, #2 846cabdff1aSopenharmony_ci dup v30.8h, w2 // alpha 847cabdff1aSopenharmony_ci dup v31.8h, w3 // beta 848cabdff1aSopenharmony_ci.endm 849cabdff1aSopenharmony_ci 850cabdff1aSopenharmony_ci.macro h264_loop_filter_chroma_10 851cabdff1aSopenharmony_ci dup v22.8h, w2 // alpha 852cabdff1aSopenharmony_ci dup v23.8h, w3 // beta 853cabdff1aSopenharmony_ci uxtl v24.8h, v24.8b // tc0 854cabdff1aSopenharmony_ci 855cabdff1aSopenharmony_ci uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0) 856cabdff1aSopenharmony_ci uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0) 857cabdff1aSopenharmony_ci uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0) 858cabdff1aSopenharmony_ci cmhi v26.8h, v22.8h, v26.8h // < alpha 859cabdff1aSopenharmony_ci cmhi v28.8h, v23.8h, v28.8h // < beta 860cabdff1aSopenharmony_ci cmhi v30.8h, v23.8h, v30.8h // < beta 861cabdff1aSopenharmony_ci 862cabdff1aSopenharmony_ci and v26.16b, v26.16b, v28.16b 863cabdff1aSopenharmony_ci mov v4.16b, v0.16b 864cabdff1aSopenharmony_ci sub v4.8h, v4.8h, v16.8h 865cabdff1aSopenharmony_ci and v26.16b, v26.16b, v30.16b 866cabdff1aSopenharmony_ci shl v4.8h, v4.8h, #2 867cabdff1aSopenharmony_ci mov x8, v26.d[0] 868cabdff1aSopenharmony_ci mov x9, v26.d[1] 869cabdff1aSopenharmony_ci sli v24.8h, v24.8h, #8 870cabdff1aSopenharmony_ci uxtl v24.8h, v24.8b 871cabdff1aSopenharmony_ci add v4.8h, v4.8h, v18.8h 872cabdff1aSopenharmony_ci adds x8, x8, x9 873cabdff1aSopenharmony_ci shl v24.8h, v24.8h, #2 
874cabdff1aSopenharmony_ci 875cabdff1aSopenharmony_ci b.eq 9f 876cabdff1aSopenharmony_ci 877cabdff1aSopenharmony_ci movi v31.8h, #3 // (tc0 - 1) << (BIT_DEPTH - 8)) + 1 878cabdff1aSopenharmony_ci uqsub v24.8h, v24.8h, v31.8h 879cabdff1aSopenharmony_ci sub v4.8h, v4.8h, v2.8h 880cabdff1aSopenharmony_ci srshr v4.8h, v4.8h, #3 881cabdff1aSopenharmony_ci smin v4.8h, v4.8h, v24.8h 882cabdff1aSopenharmony_ci neg v25.8h, v24.8h 883cabdff1aSopenharmony_ci smax v4.8h, v4.8h, v25.8h 884cabdff1aSopenharmony_ci and v4.16b, v4.16b, v26.16b 885cabdff1aSopenharmony_ci add v16.8h, v16.8h, v4.8h 886cabdff1aSopenharmony_ci sub v0.8h, v0.8h, v4.8h 887cabdff1aSopenharmony_ci 888cabdff1aSopenharmony_ci mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping 889cabdff1aSopenharmony_ci movi v5.8h, #0 890cabdff1aSopenharmony_ci smin v0.8h, v0.8h, v4.8h 891cabdff1aSopenharmony_ci smin v16.8h, v16.8h, v4.8h 892cabdff1aSopenharmony_ci smax v0.8h, v0.8h, v5.8h 893cabdff1aSopenharmony_ci smax v16.8h, v16.8h, v5.8h 894cabdff1aSopenharmony_ci.endm 895cabdff1aSopenharmony_ci 896cabdff1aSopenharmony_cifunction ff_h264_v_loop_filter_chroma_neon_10, export=1 897cabdff1aSopenharmony_ci h264_loop_filter_start_10 898cabdff1aSopenharmony_ci 899cabdff1aSopenharmony_ci mov x10, x0 900cabdff1aSopenharmony_ci sub x0, x0, x1, lsl #1 901cabdff1aSopenharmony_ci ld1 {v18.8h}, [x0 ], x1 902cabdff1aSopenharmony_ci ld1 {v0.8h}, [x10], x1 903cabdff1aSopenharmony_ci ld1 {v16.8h}, [x0 ], x1 904cabdff1aSopenharmony_ci ld1 {v2.8h}, [x10] 905cabdff1aSopenharmony_ci 906cabdff1aSopenharmony_ci h264_loop_filter_chroma_10 907cabdff1aSopenharmony_ci 908cabdff1aSopenharmony_ci sub x0, x10, x1, lsl #1 909cabdff1aSopenharmony_ci st1 {v16.8h}, [x0], x1 910cabdff1aSopenharmony_ci st1 {v0.8h}, [x0], x1 911cabdff1aSopenharmony_ci9: 912cabdff1aSopenharmony_ci ret 913cabdff1aSopenharmony_ciendfunc 914cabdff1aSopenharmony_ci 915cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma_neon_10, export=1 916cabdff1aSopenharmony_ci 
h264_loop_filter_start_10 917cabdff1aSopenharmony_ci 918cabdff1aSopenharmony_ci sub x0, x0, #4 // access the 2nd left pixel 919cabdff1aSopenharmony_cih_loop_filter_chroma420_10: 920cabdff1aSopenharmony_ci add x10, x0, x1, lsl #2 921cabdff1aSopenharmony_ci ld1 {v18.d}[0], [x0 ], x1 922cabdff1aSopenharmony_ci ld1 {v18.d}[1], [x10], x1 923cabdff1aSopenharmony_ci ld1 {v16.d}[0], [x0 ], x1 924cabdff1aSopenharmony_ci ld1 {v16.d}[1], [x10], x1 925cabdff1aSopenharmony_ci ld1 {v0.d}[0], [x0 ], x1 926cabdff1aSopenharmony_ci ld1 {v0.d}[1], [x10], x1 927cabdff1aSopenharmony_ci ld1 {v2.d}[0], [x0 ], x1 928cabdff1aSopenharmony_ci ld1 {v2.d}[1], [x10], x1 929cabdff1aSopenharmony_ci 930cabdff1aSopenharmony_ci transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 931cabdff1aSopenharmony_ci 932cabdff1aSopenharmony_ci h264_loop_filter_chroma_10 933cabdff1aSopenharmony_ci 934cabdff1aSopenharmony_ci transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 935cabdff1aSopenharmony_ci 936cabdff1aSopenharmony_ci sub x0, x10, x1, lsl #3 937cabdff1aSopenharmony_ci st1 {v18.d}[0], [x0], x1 938cabdff1aSopenharmony_ci st1 {v16.d}[0], [x0], x1 939cabdff1aSopenharmony_ci st1 {v0.d}[0], [x0], x1 940cabdff1aSopenharmony_ci st1 {v2.d}[0], [x0], x1 941cabdff1aSopenharmony_ci st1 {v18.d}[1], [x0], x1 942cabdff1aSopenharmony_ci st1 {v16.d}[1], [x0], x1 943cabdff1aSopenharmony_ci st1 {v0.d}[1], [x0], x1 944cabdff1aSopenharmony_ci st1 {v2.d}[1], [x0], x1 945cabdff1aSopenharmony_ci9: 946cabdff1aSopenharmony_ci ret 947cabdff1aSopenharmony_ciendfunc 948cabdff1aSopenharmony_ci 949cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma422_neon_10, export=1 950cabdff1aSopenharmony_ci h264_loop_filter_start_10 951cabdff1aSopenharmony_ci add x5, x0, x1 952cabdff1aSopenharmony_ci sub x0, x0, #4 953cabdff1aSopenharmony_ci add x1, x1, x1 954cabdff1aSopenharmony_ci mov x7, x30 955cabdff1aSopenharmony_ci bl h_loop_filter_chroma420_10 956cabdff1aSopenharmony_ci mov x30, x7 957cabdff1aSopenharmony_ci sub x0, x5, #4 
958cabdff1aSopenharmony_ci mov v24.s[0], w6 959cabdff1aSopenharmony_ci b h_loop_filter_chroma420_10 960cabdff1aSopenharmony_ciendfunc 961cabdff1aSopenharmony_ci 962cabdff1aSopenharmony_ci.macro h264_loop_filter_chroma_intra_10 963cabdff1aSopenharmony_ci uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0) 964cabdff1aSopenharmony_ci uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0) 965cabdff1aSopenharmony_ci uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0) 966cabdff1aSopenharmony_ci cmhi v26.8h, v30.8h, v26.8h // < alpha 967cabdff1aSopenharmony_ci cmhi v27.8h, v31.8h, v27.8h // < beta 968cabdff1aSopenharmony_ci cmhi v28.8h, v31.8h, v28.8h // < beta 969cabdff1aSopenharmony_ci and v26.16b, v26.16b, v27.16b 970cabdff1aSopenharmony_ci and v26.16b, v26.16b, v28.16b 971cabdff1aSopenharmony_ci mov x2, v26.d[0] 972cabdff1aSopenharmony_ci mov x3, v26.d[1] 973cabdff1aSopenharmony_ci 974cabdff1aSopenharmony_ci shl v4.8h, v18.8h, #1 975cabdff1aSopenharmony_ci shl v6.8h, v19.8h, #1 976cabdff1aSopenharmony_ci 977cabdff1aSopenharmony_ci adds x2, x2, x3 978cabdff1aSopenharmony_ci b.eq 9f 979cabdff1aSopenharmony_ci 980cabdff1aSopenharmony_ci add v20.8h, v16.8h, v19.8h 981cabdff1aSopenharmony_ci add v22.8h, v17.8h, v18.8h 982cabdff1aSopenharmony_ci add v20.8h, v20.8h, v4.8h 983cabdff1aSopenharmony_ci add v22.8h, v22.8h, v6.8h 984cabdff1aSopenharmony_ci urshr v24.8h, v20.8h, #2 985cabdff1aSopenharmony_ci urshr v25.8h, v22.8h, #2 986cabdff1aSopenharmony_ci bit v16.16b, v24.16b, v26.16b 987cabdff1aSopenharmony_ci bit v17.16b, v25.16b, v26.16b 988cabdff1aSopenharmony_ci.endm 989cabdff1aSopenharmony_ci 990cabdff1aSopenharmony_cifunction ff_h264_v_loop_filter_chroma_intra_neon_10, export=1 991cabdff1aSopenharmony_ci h264_loop_filter_start_intra_10 992cabdff1aSopenharmony_ci mov x9, x0 993cabdff1aSopenharmony_ci sub x0, x0, x1, lsl #1 994cabdff1aSopenharmony_ci ld1 {v18.8h}, [x0], x1 995cabdff1aSopenharmony_ci ld1 {v17.8h}, [x9], x1 996cabdff1aSopenharmony_ci ld1 {v16.8h}, [x0], x1 997cabdff1aSopenharmony_ci 
ld1 {v19.8h}, [x9] 998cabdff1aSopenharmony_ci 999cabdff1aSopenharmony_ci h264_loop_filter_chroma_intra_10 1000cabdff1aSopenharmony_ci 1001cabdff1aSopenharmony_ci sub x0, x9, x1, lsl #1 1002cabdff1aSopenharmony_ci st1 {v16.8h}, [x0], x1 1003cabdff1aSopenharmony_ci st1 {v17.8h}, [x0], x1 1004cabdff1aSopenharmony_ci 1005cabdff1aSopenharmony_ci9: 1006cabdff1aSopenharmony_ci ret 1007cabdff1aSopenharmony_ciendfunc 1008cabdff1aSopenharmony_ci 1009cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1 1010cabdff1aSopenharmony_ci h264_loop_filter_start_intra_10 1011cabdff1aSopenharmony_ci 1012cabdff1aSopenharmony_ci sub x4, x0, #4 1013cabdff1aSopenharmony_ci sub x0, x0, #2 1014cabdff1aSopenharmony_ci add x9, x4, x1, lsl #1 1015cabdff1aSopenharmony_ci ld1 {v18.8h}, [x4], x1 1016cabdff1aSopenharmony_ci ld1 {v17.8h}, [x9], x1 1017cabdff1aSopenharmony_ci ld1 {v16.8h}, [x4], x1 1018cabdff1aSopenharmony_ci ld1 {v19.8h}, [x9], x1 1019cabdff1aSopenharmony_ci 1020cabdff1aSopenharmony_ci transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 1021cabdff1aSopenharmony_ci 1022cabdff1aSopenharmony_ci h264_loop_filter_chroma_intra_10 1023cabdff1aSopenharmony_ci 1024cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[0], [x0], x1 1025cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[1], [x0], x1 1026cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[2], [x0], x1 1027cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[3], [x0], x1 1028cabdff1aSopenharmony_ci 1029cabdff1aSopenharmony_ci9: 1030cabdff1aSopenharmony_ci ret 1031cabdff1aSopenharmony_ciendfunc 1032cabdff1aSopenharmony_ci 1033cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma_intra_neon_10, export=1 1034cabdff1aSopenharmony_ci h264_loop_filter_start_intra_10 1035cabdff1aSopenharmony_ci sub x4, x0, #4 1036cabdff1aSopenharmony_ci sub x0, x0, #2 1037cabdff1aSopenharmony_cih_loop_filter_chroma420_intra_10: 1038cabdff1aSopenharmony_ci add x9, x4, x1, lsl #2 1039cabdff1aSopenharmony_ci ld1 {v18.4h}, [x4], x1 
1040cabdff1aSopenharmony_ci ld1 {v18.d}[1], [x9], x1 1041cabdff1aSopenharmony_ci ld1 {v16.4h}, [x4], x1 1042cabdff1aSopenharmony_ci ld1 {v16.d}[1], [x9], x1 1043cabdff1aSopenharmony_ci ld1 {v17.4h}, [x4], x1 1044cabdff1aSopenharmony_ci ld1 {v17.d}[1], [x9], x1 1045cabdff1aSopenharmony_ci ld1 {v19.4h}, [x4], x1 1046cabdff1aSopenharmony_ci ld1 {v19.d}[1], [x9], x1 1047cabdff1aSopenharmony_ci 1048cabdff1aSopenharmony_ci transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 1049cabdff1aSopenharmony_ci 1050cabdff1aSopenharmony_ci h264_loop_filter_chroma_intra_10 1051cabdff1aSopenharmony_ci 1052cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[0], [x0], x1 1053cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[1], [x0], x1 1054cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[2], [x0], x1 1055cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[3], [x0], x1 1056cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[4], [x0], x1 1057cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[5], [x0], x1 1058cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[6], [x0], x1 1059cabdff1aSopenharmony_ci st2 {v16.h,v17.h}[7], [x0], x1 1060cabdff1aSopenharmony_ci 1061cabdff1aSopenharmony_ci9: 1062cabdff1aSopenharmony_ci ret 1063cabdff1aSopenharmony_ciendfunc 1064cabdff1aSopenharmony_ci 1065cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1 1066cabdff1aSopenharmony_ci h264_loop_filter_start_intra_10 1067cabdff1aSopenharmony_ci sub x4, x0, #4 1068cabdff1aSopenharmony_ci add x5, x0, x1, lsl #3 1069cabdff1aSopenharmony_ci sub x0, x0, #2 1070cabdff1aSopenharmony_ci mov x7, x30 1071cabdff1aSopenharmony_ci bl h_loop_filter_chroma420_intra_10 1072cabdff1aSopenharmony_ci mov x4, x9 1073cabdff1aSopenharmony_ci sub x0, x5, #2 1074cabdff1aSopenharmony_ci mov x30, x7 1075cabdff1aSopenharmony_ci b h_loop_filter_chroma420_intra_10 1076cabdff1aSopenharmony_ciendfunc 1077