/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/*
 * Register contract shared by every entry point in the 8-bit section:
 *   x0  address of the first byte of the block that gets written
 *   x1  distance in bytes between two consecutive rows
 * Neighbouring pixels are read from the row at x0 - x1 ("top") and from
 * the column at x0 - 1 ("left").  Only x0-x5, w3 and v0-v7 are modified;
 * no stack is used.
 * NOTE(review): the C prototype is presumably
 * void f(uint8_t *src, ptrdiff_t stride) - confirm against the init file.
 */

/*
 * ldcol.8 rd, rs, rt, n=8, hi=0
 *
 * Gather bytes into consecutive byte lanes of vector rd.  Each load reads
 * one byte from [rs] and then post-increments rs by rt, so with rt set to
 * the row stride this collects a pixel column.
 *   n >= 8        lanes 0-7 are loaded (n == 16 also loads lanes 8-15)
 *   n < 8, hi=0   only lanes 0-3 are loaded
 *   n < 8, hi=1   only lanes 4-7 are loaded
 * Lanes that are not loaded keep their previous contents.  On exit rs
 * points one step (rt) past the last byte that was read.
 */
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n >= 8 || \hi == 0
        ld1             {\rd\().b}[0],  [\rs], \rt
        ld1             {\rd\().b}[1],  [\rs], \rt
        ld1             {\rd\().b}[2],  [\rs], \rt
        ld1             {\rd\().b}[3],  [\rs], \rt
.endif
.if \n >= 8 || \hi == 1
        ld1             {\rd\().b}[4],  [\rs], \rt
        ld1             {\rd\().b}[5],  [\rs], \rt
        ld1             {\rd\().b}[6],  [\rs], \rt
        ld1             {\rd\().b}[7],  [\rs], \rt
.endif
.if \n == 16
        ld1             {\rd\().b}[8],  [\rs], \rt
        ld1             {\rd\().b}[9],  [\rs], \rt
        ld1             {\rd\().b}[10], [\rs], \rt
        ld1             {\rd\().b}[11], [\rs], \rt
        ld1             {\rd\().b}[12], [\rs], \rt
        ld1             {\rd\().b}[13], [\rs], \rt
        ld1             {\rd\().b}[14], [\rs], \rt
        ld1             {\rd\().b}[15], [\rs], \rt
.endif
.endm

/*
 * 16x16, constant fill: every byte of the block is set to 128.
 * Tail-branches into the store loop at .L_pred16x16_dc_end.
 */
function ff_pred16x16_128_dc_neon, export=1
        movi            v0.16b,  #128
        b               .L_pred16x16_dc_end
endfunc

/*
 * 16x16, DC from the top row only:
 * fill value = (sum of the 16 bytes at x0 - x1, plus 8) >> 4.
 */
function ff_pred16x16_top_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b             // 16-bit sum of the 16 bytes
        rshrn           v0.8b,  v0.8h,  #4      // rounding divide by 16
        dup             v0.16b, v0.b[0]
        b               .L_pred16x16_dc_end
endfunc

/*
 * 16x16, DC from the left column only:
 * fill value = (sum of the 16 bytes at x0 - 1 + i * x1, plus 8) >> 4.
 */
function ff_pred16x16_left_dc_neon, export=1
        sub             x2,  x0,  #1            // x2 = column left of the block
        ldcol.8         v0,  x2,  x1, 16
        uaddlv          h0,  v0.16b
        rshrn           v0.8b,  v0.8h,  #4      // rounding divide by 16
        dup             v0.16b, v0.b[0]
        b               .L_pred16x16_dc_end
endfunc

/*
 * 16x16, DC from top row and left column:
 * fill value = (sum of 16 top bytes + sum of 16 left bytes, plus 16) >> 5.
 *
 * The label .L_pred16x16_dc_end inside this function is the shared tail of
 * all 8-bit 16x16 DC variants: it expects the fill value replicated in
 * v0.16b and writes 16 rows of 16 bytes.
 */
function ff_pred16x16_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #1            // x3 = column left of the block
        ld1             {v0.16b}, [x2]
        ldcol.8         v1,  x3,  x1, 16
        uaddlv          h0,  v0.16b             // top sum
        uaddlv          h1,  v1.16b             // left sum
        add             v0.4h,  v0.4h,  v1.4h
        rshrn           v0.8b,  v0.8h,  #5      // rounding divide by 32
        dup             v0.16b, v0.b[0]
.L_pred16x16_dc_end:
        mov             w3,  #8                 // 8 iterations x 2 rows
6:      st1             {v0.16b}, [x0], x1
        subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        b.ne            6b
        ret
endfunc

/*
 * 16x16, horizontal: each of the 16 rows is filled with the byte located
 * immediately to its left (x0 - 1 on that row).
 */
function ff_pred16x16_hor_neon, export=1
        sub             x2,  x0,  #1            // x2 walks down the left column
        mov             w3,  #16
1:      ld1r            {v0.16b}, [x2], x1      // broadcast the left pixel
        subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        b.ne            1b
        ret
endfunc

/*
 * 16x16, vertical: the 16 bytes of the row above the block are copied into
 * each of the 16 rows.
 */
function ff_pred16x16_vert_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        add             x1,  x1,  x1            // step two rows at a time
        ld1             {v0.16b}, [x2], x1      // afterwards x2 = second row of the block
        mov             w3,  #8
1:      subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1      // rows 0, 2, 4, ...
        st1             {v0.16b}, [x2], x1      // rows 1, 3, 5, ...
        b.ne            1b
        ret
endfunc

/*
 * 16x16, plane prediction.
 *
 * Reads the top row from column -1 to 15 and the left column from row -1
 * to 15 (row/column 7 excluded), forms two weighted gradient sums using the
 * weights 1..8 from p16weight, scales each as (5 * sum + 32) >> 6, and
 * then writes 16 rows in which the value grows linearly along the row and
 * from row to row.  Output bytes are produced with a saturating >> 5.
 * NOTE(review): this looks like the H.264 Intra_16x16 plane mode - confirm
 * against the C reference before relying on that.
 */
function ff_pred16x16_plane_neon, export=1
        sub             x3,  x0,  x1            // x3 = row above the block
        movrel          x4,  p16weight
        add             x2,  x3,  #8            // x2 = top[8]
        sub             x3,  x3,  #1            // x3 = top-left corner pixel
        ld1             {v0.8b},  [x3]          // top[-1..6]
        ld1             {v2.8b},  [x2], x1      // top[8..15]
        ldcol.8         v1,  x3,  x1            // left[-1..6]
        add             x3,  x3,  x1            // skip row 7
        ldcol.8         v3,  x3,  x1            // left[8..15]
        rev64           v0.8b,  v0.8b           // mirror so lane i pairs with 8 + i
        rev64           v1.8b,  v1.8b
        uaddl           v7.8h,  v2.8b,  v3.8b   // lane 7 = top[15] + left[15]
        usubl           v2.8h,  v2.8b,  v0.8b   // horizontal differences
        usubl           v3.8h,  v3.8b,  v1.8b   // vertical differences
        ld1             {v0.8h},  [x4]          // weights 1..8
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h   // h[0] = horizontal sum, h[1] = vertical sum
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h   // 5 * sum
        rshrn           v4.4h,  v2.4s,  #6      // v4.h[0] = b, v4.h[1] = c
        trn2            v5.4h,  v4.4h,  v4.4h   // v5.h[0] = c
        add             v2.4h,  v4.4h,  v5.4h
        shl             v3.4h,  v2.4h,  #3
        ext             v7.16b, v7.16b, v7.16b, #14     // bring lane 7 to lane 0
        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
        add             v7.4h,  v7.4h,  v0.4h   // + 1 (first weight) in lane 0
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h   // start value for the first pixel
        shl             v3.4h,  v4.4h,  #4
        ext             v0.16b, v0.16b, v0.16b, #14
        sub             v6.4h,  v5.4h,  v3.4h   // c - 16 * b
        mov             v0.h[0],  wzr           // v0 = 0,1,2,...,7
        mul             v0.8h,  v0.8h,  v4.h[0] // per-column offsets 0*b .. 7*b
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.8h,  v6.h[0]
        shl             v2.8h,  v2.8h,  #3      // 8 * b: step to the right half
        add             v1.8h,  v1.8h,  v0.8h   // left half of row 0
        add             v3.8h,  v3.8h,  v2.8h   // c - 8 * b: right half -> next row, left half
        mov             w3,  #16
1:
        sqshrun         v0.8b,  v1.8h,  #5      // columns 0-7, clipped to 0..255
        add             v1.8h,  v1.8h,  v2.8h
        sqshrun2        v0.16b, v1.8h,  #5      // columns 8-15
        add             v1.8h,  v1.8h,  v3.8h
        subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        b.ne            1b
        ret
endfunc

/* Gradient weights used by the plane predictors (16-bit each). */
const   p16weight, align=4
        .short          1,2,3,4,5,6,7,8
endconst
const   p8weight, align=4
        .short          1,2,3,4,1,2,3,4
endconst

/*
 * 8x8, horizontal: each of the 8 rows is filled with the byte located
 * immediately to its left.
 */
function ff_pred8x8_hor_neon, export=1
        sub             x2,  x0,  #1            // x2 walks down the left column
        mov             w3,  #8
1:      ld1r            {v0.8b},  [x2], x1
        subs            w3,  w3,  #1
        st1             {v0.8b},  [x0], x1
        b.ne            1b
        ret
endfunc

/*
 * 8x8, vertical: the 8 bytes of the row above the block are copied into
 * each of the 8 rows.
 */
function ff_pred8x8_vert_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        lsl             x1,  x1,  #1            // step two rows at a time
        ld1             {v0.8b},  [x2], x1      // afterwards x2 = second row of the block
        mov             w3,  #4
1:      subs            w3,  w3,  #1
        st1             {v0.8b},  [x0], x1      // rows 0, 2, 4, 6
        st1             {v0.8b},  [x2], x1      // rows 1, 3, 5, 7
        b.ne            1b
        ret
endfunc

/*
 * 8x8, plane prediction.
 *
 * Same scheme as the 16x16 version but with 4 + 4 neighbours per direction
 * (row/column 3 excluded), weights 1..4 from p8weight and the scaling
 * (17 * sum + 16) >> 5.  Output bytes are produced with a saturating >> 5.
 */
function ff_pred8x8_plane_neon, export=1
        sub             x3,  x0,  x1            // x3 = row above the block
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        add             x2,  x3,  #4            // x2 = top[4]
        sub             x3,  x3,  #1            // x3 = top-left corner pixel
        ld1             {v0.s}[0],  [x3]        // top[-1..2]
        ld1             {v2.s}[0],  [x2], x1    // top[4..7]
        ldcol.8         v0,  x3,  x1,  4,  hi=1 // left[-1..2] into lanes 4-7
        add             x3,  x3,  x1            // skip row 3
        ldcol.8         v3,  x3,  x1,  4        // left[4..7]
        uaddl           v7.8h,  v2.8b,  v3.8b   // lane 3 = top[7] + left[7]
        rev32           v0.8b,  v0.8b           // mirror both 4-byte groups
        trn1            v2.2s,  v2.2s,  v3.2s   // v2 = top[4..7], left[4..7]
        usubl           v2.8h,  v2.8b,  v0.8b   // horizontal and vertical differences
        ld1             {v6.8h},  [x4]          // weights 1..4, 1..4
        mul             v2.8h,  v2.8h,  v6.8h
        ld1             {v0.8h},  [x5]          // 1..8, used for column offsets below
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s   // s[0] = horizontal sum, s[1] = vertical sum
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s   // 17 * sum
        rshrn           v5.4h,  v2.4s,  #5      // v5.h[0] = b, v5.h[1] = c
        addp            v2.4h,  v5.4h,  v5.4h   // b + c
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h   // 3 * (b + c)
        rev64           v7.4h,  v7.4h           // bring lane 3 to lane 0
        add             v7.4h,  v7.4h,  v0.4h   // + 1 (first weight) in lane 0
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h   // start value for the first pixel
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr           // v0 = 0,1,2,...,7
        mul             v0.8h,  v0.8h,  v5.h[0] // per-column offsets 0*b .. 7*b
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v5.h[1]         // per-row increment c
        add             v1.8h,  v1.8h,  v0.8h   // row 0
        mov             w3,  #8
1:
        sqshrun         v0.8b,  v1.8h,  #5      // clip to 0..255
        subs            w3,  w3,  #1
        add             v1.8h,  v1.8h,  v2.8h
        st1             {v0.8b},  [x0], x1
        b.ne            1b
        ret
endfunc

/*
 * 8x8, constant fill: every byte of the block is set to 128.
 */
function ff_pred8x8_128_dc_neon, export=1
        movi            v0.8b,  #128
        movi            v1.8b,  #128
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8, DC from the top row only.  The left four columns use the rounded
 * average of top[0..3], the right four columns that of top[4..7]; the same
 * row pattern is used for all 8 rows.
 */
function ff_pred8x8_top_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.8b},  [x2]
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = sum top[0..3], h[1] = sum top[4..7]
        zip1            v0.8h,  v0.8h,  v0.8h
        rshrn           v2.8b,  v0.8h,  #2      // rounding divide by 4
        zip1            v0.8b,  v2.8b,  v2.8b
        zip1            v1.8b,  v2.8b,  v2.8b
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8, DC from the left column only.  Rows 0-3 are filled with the rounded
 * average of left[0..3], rows 4-7 with that of left[4..7].
 */
function ff_pred8x8_left_dc_neon, export=1
        sub             x2,  x0,  #1            // x2 = column left of the block
        ldcol.8         v0,  x2,  x1
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = sum left[0..3], h[1] = sum left[4..7]
        rshrn           v2.8b,  v0.8h,  #2      // rounding divide by 4
        dup             v1.8b,  v2.b[1]
        dup             v0.8b,  v2.b[0]
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8, DC from top row and left column, computed per 4x4 quadrant:
 *   top-left      (top[0..3] + left[0..3] + 4) >> 3
 *   top-right     (top[4..7] + 2) >> 2
 *   bottom-left   (left[4..7] + 2) >> 2
 *   bottom-right  (top[4..7] + left[4..7] + 4) >> 3
 *
 * The label .L_pred8x8_dc_end inside this function is the shared tail of
 * all 8-bit 8x8 DC variants: it expects the row pattern for rows 0-3 in
 * v0.8b and the row pattern for rows 4-7 in v1.8b.
 */
function ff_pred8x8_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #1            // x3 = column left of the block
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1
        uaddlp          v0.4h,  v0.8b
        uaddlp          v1.4h,  v1.8b
        trn1            v2.2s,  v0.2s,  v1.2s
        trn2            v3.2s,  v0.2s,  v1.2s
        addp            v4.4h,  v2.4h,  v3.4h   // sums: top lo, left lo, top hi, left hi
        addp            v5.4h,  v4.4h,  v4.4h   // h[0] = top lo + left lo, h[1] = top hi + left hi
        rshrn           v6.8b,  v5.8h,  #3
        rshrn           v7.8b,  v4.8h,  #2
        dup             v0.8b,  v6.b[0]         // top-left quadrant
        dup             v2.8b,  v7.b[2]         // top-right quadrant
        dup             v1.8b,  v7.b[3]         // bottom-left quadrant
        dup             v3.8b,  v6.b[1]         // bottom-right quadrant
        zip1            v0.2s,  v0.2s,  v2.2s
        zip1            v1.2s,  v1.2s,  v3.2s
.L_pred8x8_dc_end:
        mov             w3,  #4
        add             x2,  x0,  x1,  lsl #2   // x2 = row 4 of the block
6:      subs            w3,  w3,  #1
        st1             {v0.8b},  [x0], x1      // rows 0-3
        st1             {v1.8b},  [x2], x1      // rows 4-7
        b.ne            6b
        ret
endfunc

/*
 * 8x8 DC variant that reads the full top row and only left[0..3]:
 *   top-left      (top[0..3] + left[0..3] + 4) >> 3
 *   top-right     (top[4..7] + 2) >> 2
 *   bottom-left   (top[0..3] + 2) >> 2
 *   bottom-right  (top[4..7] + 2) >> 2
 */
function ff_pred8x8_l0t_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #1            // x3 = column left of the block
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4        // left[0..3] in lanes 0-3
        zip1            v0.4s,  v0.4s,  v1.4s   // top lo, left lo, top hi, (unused)
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h   // h[0] = top lo, h[1] = left lo, h[2] = top hi
        addp            v1.4h,  v0.4h,  v0.4h   // h[0] = top lo + left lo
        rshrn           v2.8b,  v0.8h,  #2
        rshrn           v3.8b,  v1.8h,  #3
        dup             v4.8b,  v3.b[0]
        dup             v6.8b,  v2.b[2]
        dup             v5.8b,  v2.b[0]
        zip1            v0.2s,  v4.2s,  v6.2s
        zip1            v1.2s,  v5.2s,  v6.2s
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8 DC variant that reads only left[0..3]: rows 0-3 are filled with
 * (left[0..3] + 2) >> 2, rows 4-7 with the constant 128.
 */
function ff_pred8x8_l00_dc_neon, export=1
        sub             x2,  x0,  #1            // x2 = column left of the block
        ldcol.8         v0,  x2,  x1,  4
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h
        rshrn           v0.8b,  v0.8h,  #2
        movi            v1.8b,  #128
        dup             v0.8b,  v0.b[0]
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8 DC variant that reads the full top row and only left[4..7]:
 *   top-left      (top[0..3] + 2) >> 2
 *   top-right     (top[4..7] + 2) >> 2
 *   bottom-left   (left[4..7] + 2) >> 2
 *   bottom-right  (top[4..7] + left[4..7] + 4) >> 3
 */
function ff_pred8x8_0lt_dc_neon, export=1
        add             x3,  x0,  x1,  lsl #2   // row 4 of the block
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x3,  #1            // x3 = left[4]
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4,  hi=1 // left[4..7] in lanes 4-7
        zip1            v0.4s,  v0.4s,  v1.4s   // top lo, (unused), top hi, left hi
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h   // h[0] = top lo, h[2] = top hi, h[3] = left hi
        addp            v1.4h,  v0.4h,  v0.4h   // h[1] = top hi + left hi
        rshrn           v2.8b,  v0.8h,  #2
        rshrn           v3.8b,  v1.8h,  #3
        dup             v4.8b,  v2.b[0]
        dup             v5.8b,  v2.b[3]
        dup             v6.8b,  v2.b[2]
        dup             v7.8b,  v3.b[1]
        zip1            v0.2s,  v4.2s,  v6.2s
        zip1            v1.2s,  v5.2s,  v7.2s
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8 DC variant that reads only left[4..7]: rows 0-3 are filled with the
 * constant 128, rows 4-7 with (left[4..7] + 2) >> 2.
 */
function ff_pred8x8_0l0_dc_neon, export=1
        add             x2,  x0,  x1,  lsl #2   // row 4 of the block
        sub             x2,  x2,  #1            // x2 = left[4]
        ldcol.8         v1,  x2,  x1,  4
        uaddlp          v2.4h,  v1.8b
        addp            v2.4h,  v2.4h,  v2.4h
        rshrn           v1.8b,  v2.8h,  #2
        movi            v0.8b,  #128
        dup             v1.8b,  v1.b[0]
        b               .L_pred8x8_dc_end
endfunc

/*
 * ldcol.16 rd, rs, rt, n=4, hi=0
 *
 * 16-bit counterpart of ldcol.8: gather halfwords into consecutive
 * halfword lanes of vector rd, reading one halfword from [rs] per lane and
 * post-incrementing rs by rt after every load.
 *   hi=0, n >= 4   lanes 0-3 are loaded (n == 8 also loads lanes 4-7)
 *   hi=1           only lanes 4-7 are loaded
 * Lanes that are not loaded keep their previous contents.
 */
.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
.if \n >= 4 && \hi == 0
        ld1             {\rd\().h}[0],  [\rs], \rt
        ld1             {\rd\().h}[1],  [\rs], \rt
        ld1             {\rd\().h}[2],  [\rs], \rt
        ld1             {\rd\().h}[3],  [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        ld1             {\rd\().h}[4],  [\rs], \rt
        ld1             {\rd\().h}[5],  [\rs], \rt
        ld1             {\rd\().h}[6],  [\rs], \rt
        ld1             {\rd\().h}[7],  [\rs], \rt
.endif
.endm

// slower than C
/*
function ff_pred16x16_128_dc_neon_10, export=1
        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)

        b               .L_pred16x16_dc_10_end
endfunc
*/

/*
 * Register contract shared by the 16x16 entry points for 16-bit samples
 * (suffix _10) in this part of the file:
 *   x0  address of the first sample of the block that gets written
 *   x1  distance in bytes between two consecutive rows
 * Samples are 2 bytes wide: the top neighbours are read from x0 - x1 and
 * the left neighbours from x0 - 2.  Only x0-x4 and v0-v3 are modified; no
 * stack is used.
 */

/*
 * 16x16, DC from the top row only:
 * fill value = (sum of the 16 samples at x0 - x1, plus 8) >> 4.
 */
function ff_pred16x16_top_dc_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block

        ld1             {v0.8h, v1.8h},  [x2]

        add             v0.8h,  v0.8h,  v1.8h
        addv            h0,  v0.8h              // sum of all 16 samples

        urshr           v0.4h,  v0.4h,  #4      // rounding divide by 16
        dup             v0.8h,  v0.h[0]
        b               .L_pred16x16_dc_10_end
endfunc

// slower than C
/*
function ff_pred16x16_left_dc_neon_10, export=1
        sub             x2,  x0,  #2 // access to the "left" column
        ldcol.16        v0,  x2,  x1,  8
        ldcol.16        v1,  x2,  x1,  8 // load "left" column

        add             v0.8h,  v0.8h,  v1.8h
        addv            h0,  v0.8h

        urshr           v0.4h,  v0.4h,  #4
        dup             v0.8h,  v0.h[0]
        b               .L_pred16x16_dc_10_end
endfunc
*/

/*
 * 16x16, DC from top row and left column:
 * fill value = (sum of 16 top samples + sum of 16 left samples, plus 16) >> 5.
 *
 * The label .L_pred16x16_dc_10_end inside this function is the shared tail
 * of the 16-bit 16x16 DC variants: it expects the fill value replicated in
 * v0.8h and writes 16 rows of 16 samples.
 */
function ff_pred16x16_dc_neon_10, export=1
        sub             x2,  x0,  x1 // access to the "top" row
        sub             x3,  x0,  #2 // access to the "left" column

        ld1             {v0.8h, v1.8h},  [x2]
        ldcol.16        v2,  x3,  x1,  8
        ldcol.16        v3,  x3,  x1,  8 // load pixels in "top" row and "left" col

        add             v0.8h,  v0.8h,  v1.8h
        add             v2.8h,  v2.8h,  v3.8h
        add             v0.8h,  v0.8h,  v2.8h
        addv            h0,  v0.8h              // sum of all 32 neighbours

        urshr           v0.4h,  v0.4h,  #5      // rounding divide by 32
        dup             v0.8h,  v0.h[0]
.L_pred16x16_dc_10_end:
        mov             v1.16b,  v0.16b         // second half of each row
        mov             w3,  #8                 // 8 iterations x 2 rows
6:      st1             {v0.8h, v1.8h},  [x0], x1
        subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h},  [x0], x1
        b.ne            6b
        ret
endfunc

/*
 * 16x16, horizontal: each of the 16 rows is filled with the sample located
 * immediately to its left (x0 - 2 on that row).
 */
function ff_pred16x16_hor_neon_10, export=1
        sub             x2,  x0,  #2            // x2 walks down the left column
        add             x3,  x0,  #16           // x3 = right half of the row (8 samples in)

        mov             w4,  #16
1:      ld1r            {v0.8h},  [x2], x1      // broadcast the left sample
        subs            w4,  w4,  #1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x3], x1
        b.ne            1b
        ret
endfunc

/*
 * 16x16, vertical: the 16 samples of the row above the block are copied
 * into each of the 16 rows.
 */
function ff_pred16x16_vert_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        add             x1,  x1,  x1            // step two rows at a time

        ld1             {v0.8h, v1.8h},  [x2], x1       // afterwards x2 = second row of the block

        mov             w3,  #8
1:      subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h},  [x0], x1       // rows 0, 2, 4, ...
        st1             {v0.8h, v1.8h},  [x2], x1       // rows 1, 3, 5, ...

        b.ne            1b
        ret
endfunc

/*
 * Register contract shared by the entry points for 16-bit samples (suffix
 * _10) in this part of the file:
 *   x0  address of the first sample of the block that gets written
 *   x1  distance in bytes between two consecutive rows
 * Samples are 2 bytes wide: the top neighbours are read from x0 - x1 and
 * the left neighbours from x0 - 2.  Only x0-x5, w3, v0-v7, v16 and v17 are
 * modified; no stack is used.
 */

/*
 * 16x16, plane prediction for 16-bit samples.
 *
 * Reads the top row from column -1 to 15 and the left column from row -1
 * to 15 (row/column 7 excluded), forms two weighted gradient sums using the
 * weights 1..8 from p16weight, scales each as (5 * sum + 32) >> 6, and
 * then writes 16 rows in which the value grows linearly along the row and
 * from row to row.  The running values are kept in 32-bit lanes; each
 * output sample is produced with an unsigned-saturating >> 5 and then
 * limited to 1023.
 */
function ff_pred16x16_plane_neon_10, export=1
        sub             x3,  x0,  x1            // x3 = row above the block
        movrel          x4,  p16weight
        add             x2,  x3,  #16           // x2 = top[8]
        sub             x3,  x3,  #2            // x3 = top-left corner sample
        ld1             {v0.8h},  [x3]          // top[-1..6]
        ld1             {v2.8h},  [x2], x1      // top[8..15]
        ldcol.16        v1,  x3,  x1,  8        // left[-1..6]
        add             x3,  x3,  x1            // skip row 7
        ldcol.16        v3,  x3,  x1,  8        // left[8..15]

        // rev64 + ext #8 together reverse all eight halfwords
        rev64           v16.8h, v0.8h
        rev64           v17.8h, v1.8h
        ext             v0.16b, v16.16b, v16.16b, #8
        ext             v1.16b, v17.16b, v17.16b, #8

        add             v7.8h,  v2.8h,  v3.8h   // lane 7 = top[15] + left[15]
        sub             v2.8h,  v2.8h,  v0.8h   // horizontal differences
        sub             v3.8h,  v3.8h,  v1.8h   // vertical differences
        ld1             {v0.8h},  [x4]          // weights 1..8
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h   // h[0] = horizontal sum, h[1] = vertical sum
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h   // 5 * sum
        rshrn           v4.4h,  v2.4s,  #6      // v4.h[0] = b, v4.h[1] = c
        trn2            v5.4h,  v4.4h,  v4.4h   // v5.h[0] = c
        add             v2.4h,  v4.4h,  v5.4h
        shl             v3.4h,  v2.4h,  #3
        ext             v7.16b, v7.16b, v7.16b, #14     // bring lane 7 to lane 0
        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
        add             v7.4h,  v7.4h,  v0.4h   // + 1 (first weight) in lane 0
        shl             v2.4h,  v7.4h,  #4
        ssubl           v2.4s,  v2.4h,  v3.4h   // start value, widened to 32 bit
        shl             v3.4h,  v4.4h,  #4
        ext             v0.16b, v0.16b, v0.16b, #14
        ssubl           v6.4s,  v5.4h,  v3.4h   // c - 16 * b, widened to 32 bit

        mov             v0.h[0],  wzr           // v0 = 0,1,2,...,7
        mul             v0.8h,  v0.8h,  v4.h[0] // per-column offsets 0*b .. 7*b
        dup             v16.4s, v2.s[0]
        dup             v17.4s, v2.s[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.4s,  v6.s[0]
        shl             v2.8h,  v2.8h,  #3      // 8 * b: step to the right half
        saddw           v16.4s, v16.4s, v0.4h   // row 0, columns 0-3
        saddw2          v17.4s, v17.4s, v0.8h   // row 0, columns 4-7
        saddw           v3.4s,  v3.4s,  v2.4h   // c - 8 * b: right half -> next row, left half

        mov             w3,  #16
        mvni            v4.8h,  #0xFC, lsl #8 // 1023 for clipping
1:
        sqshrun         v0.4h,  v16.4s, #5      // columns 0-7
        sqshrun2        v0.8h,  v17.4s, #5
        saddw           v16.4s, v16.4s, v2.4h
        saddw           v17.4s, v17.4s, v2.4h
        sqshrun         v1.4h,  v16.4s, #5      // columns 8-15
        sqshrun2        v1.8h,  v17.4s, #5
        add             v16.4s, v16.4s, v3.4s
        add             v17.4s, v17.4s, v3.4s

        subs            w3,  w3,  #1

        smin            v0.8h,  v0.8h,  v4.8h
        smin            v1.8h,  v1.8h,  v4.8h

        st1             {v0.8h, v1.8h},  [x0], x1
        b.ne            1b
        ret
endfunc

/*
 * 8x8, horizontal: each of the 8 rows is filled with the sample located
 * immediately to its left.
 */
function ff_pred8x8_hor_neon_10, export=1
        sub             x2,  x0,  #2            // x2 walks down the left column
        mov             w3,  #8

1:      ld1r            {v0.8h},  [x2], x1
        subs            w3,  w3,  #1
        st1             {v0.8h},  [x0], x1
        b.ne            1b
        ret
endfunc

/*
 * 8x8, vertical: the 8 samples of the row above the block are copied into
 * each of the 8 rows.
 */
function ff_pred8x8_vert_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        lsl             x1,  x1,  #1            // step two rows at a time

        ld1             {v0.8h},  [x2], x1      // afterwards x2 = second row of the block
        mov             w3,  #4
1:      subs            w3,  w3,  #1
        st1             {v0.8h},  [x0], x1      // rows 0, 2, 4, 6
        st1             {v0.8h},  [x2], x1      // rows 1, 3, 5, 7
        b.ne            1b
        ret
endfunc

/*
 * 8x8, plane prediction for 16-bit samples.
 *
 * Same scheme as the 16x16 version but with 4 + 4 neighbours per direction
 * (row/column 3 excluded), weights 1..4 from p8weight and the scaling
 * (17 * sum + 16) >> 5.  Each output sample is produced with an
 * unsigned-saturating >> 5 and then limited to 1023.
 */
function ff_pred8x8_plane_neon_10, export=1
        sub             x3,  x0,  x1            // x3 = row above the block
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        add             x2,  x3,  #8            // x2 = top[4]
        sub             x3,  x3,  #2            // x3 = top-left corner sample
        ld1             {v0.d}[0],  [x3]        // top[-1..2]
        ld1             {v2.d}[0],  [x2], x1    // top[4..7]
        ldcol.16        v0,  x3,  x1,  hi=1     // left[-1..2] into lanes 4-7
        add             x3,  x3,  x1            // skip row 3
        ldcol.16        v3,  x3,  x1,  4        // left[4..7]
        add             v7.8h,  v2.8h,  v3.8h   // lane 3 = top[7] + left[7]
        rev64           v0.8h,  v0.8h           // mirror both 4-sample groups
        trn1            v2.2d,  v2.2d,  v3.2d   // v2 = top[4..7], left[4..7]
        sub             v2.8h,  v2.8h,  v0.8h   // horizontal and vertical differences
        ld1             {v6.8h},  [x4]          // weights 1..4, 1..4
        mul             v2.8h,  v2.8h,  v6.8h
        ld1             {v0.8h},  [x5]          // 1..8, used for column offsets below
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s   // s[0] = horizontal sum, s[1] = vertical sum
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s   // 17 * sum
        rshrn           v5.4h,  v2.4s,  #5      // v5.h[0] = b, v5.h[1] = c
        addp            v2.4h,  v5.4h,  v5.4h   // b + c
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h   // 3 * (b + c)
        rev64           v7.4h,  v7.4h           // bring lane 3 to lane 0
        add             v7.4h,  v7.4h,  v0.4h   // + 1 (first weight) in lane 0
        shl             v2.4h,  v7.4h,  #4
        ssubl           v2.4s,  v2.4h,  v3.4h   // start value, widened to 32 bit
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr           // v0 = 0,1,2,...,7
        mul             v0.8h,  v0.8h,  v5.h[0] // per-column offsets 0*b .. 7*b
        dup             v1.4s,  v2.s[0]
        dup             v2.4s,  v2.s[0]
        dup             v3.8h,  v5.h[1]         // per-row increment c
        saddw           v1.4s,  v1.4s,  v0.4h   // row 0, columns 0-3
        saddw2          v2.4s,  v2.4s,  v0.8h   // row 0, columns 4-7
        mov             w3,  #8
        mvni            v4.8h,  #0xFC, lsl #8 // 1023 for clipping
1:
        sqshrun         v0.4h,  v1.4s,  #5
        sqshrun2        v0.8h,  v2.4s,  #5

        saddw           v1.4s,  v1.4s,  v3.4h
        saddw           v2.4s,  v2.4s,  v3.4h

        subs            w3,  w3,  #1

        smin            v0.8h,  v0.8h,  v4.8h

        st1             {v0.8h},  [x0], x1
        b.ne            1b
        ret
endfunc

/*
 * 8x8, constant fill: every sample of the block is set to 512.
 */
function ff_pred8x8_128_dc_neon_10, export=1
        movi            v0.8h,  #2, lsl #8      // 512, 1 << (bit_depth - 1)
        movi            v1.8h,  #2, lsl #8
        b               .L_pred8x8_dc_10_end
endfunc

/*
 * 8x8, DC from the top row only.  The left four columns use the rounded
 * average of top[0..3], the right four columns that of top[4..7]; the same
 * row pattern is used for all 8 rows.
 */
function ff_pred8x8_top_dc_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.8h},  [x2]

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = sum top[0..3], h[1] = sum top[4..7]
        zip1            v0.4h,  v0.4h,  v0.4h
        urshr           v2.4h,  v0.4h,  #2      // rounding divide by 4
        zip1            v0.8h,  v2.8h,  v2.8h
        zip1            v1.8h,  v2.8h,  v2.8h
        b               .L_pred8x8_dc_10_end
endfunc

/*
 * 8x8, DC from the left column only.  Rows 0-3 are filled with the rounded
 * average of left[0..3], rows 4-7 with that of left[4..7].
 */
function ff_pred8x8_left_dc_neon_10, export=1
        sub             x2,  x0,  #2            // x2 = column left of the block
        ldcol.16        v0,  x2,  x1,  8

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = sum left[0..3], h[1] = sum left[4..7]
        urshr           v2.4h,  v0.4h,  #2      // rounding divide by 4
        dup             v1.8h,  v2.h[1]
        dup             v0.8h,  v2.h[0]
        b               .L_pred8x8_dc_10_end
endfunc

/*
 * 8x8, DC from top row and left column, computed per 4x4 quadrant:
 *   top-left      (top[0..3] + left[0..3] + 4) >> 3
 *   top-right     (top[4..7] + 2) >> 2
 *   bottom-left   (left[4..7] + 2) >> 2
 *   bottom-right  (top[4..7] + left[4..7] + 4) >> 3
 *
 * The label .L_pred8x8_dc_10_end inside this function is the shared tail
 * of all 16-bit 8x8 DC variants: it expects the row pattern for rows 0-3
 * in v0.8h and the row pattern for rows 4-7 in v1.8h.
 */
function ff_pred8x8_dc_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #2            // x3 = column left of the block

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  8

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.8h,  v1.8h,  v1.8h
        trn1            v2.2s,  v0.2s,  v1.2s
        trn2            v3.2s,  v0.2s,  v1.2s
        addp            v4.4h,  v2.4h,  v3.4h   // sums: top lo, left lo, top hi, left hi
        addp            v5.4h,  v4.4h,  v4.4h   // h[0] = top lo + left lo, h[1] = top hi + left hi
        urshr           v6.4h,  v5.4h,  #3
        urshr           v7.4h,  v4.4h,  #2
        dup             v0.8h,  v6.h[0]         // top-left quadrant
        dup             v2.8h,  v7.h[2]         // top-right quadrant
        dup             v1.8h,  v7.h[3]         // bottom-left quadrant
        dup             v3.8h,  v6.h[1]         // bottom-right quadrant
        zip1            v0.2d,  v0.2d,  v2.2d
        zip1            v1.2d,  v1.2d,  v3.2d
.L_pred8x8_dc_10_end:
        mov             w3,  #4
        add             x2,  x0,  x1,  lsl #2   // x2 = row 4 of the block

6:      st1             {v0.8h},  [x0], x1      // rows 0-3
        subs            w3,  w3,  #1
        st1             {v1.8h},  [x2], x1      // rows 4-7
        b.ne            6b
        ret
endfunc

/*
 * 8x8 DC variant that reads the full top row and only left[0..3]:
 *   top-left      (top[0..3] + left[0..3] + 4) >> 3
 *   top-right     (top[4..7] + 2) >> 2
 *   bottom-left   (top[0..3] + 2) >> 2
 *   bottom-right  (top[4..7] + 2) >> 2
 */
function ff_pred8x8_l0t_dc_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #2            // x3 = column left of the block

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  4        // left[0..3]

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.4h,  v1.4h,  v1.4h
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = sum top[0..3], h[1] = sum top[4..7]
        addp            v1.4h,  v1.4h,  v1.4h   // h[0] = sum left[0..3]
        add             v1.4h,  v1.4h,  v0.4h

        urshr           v2.4h,  v0.4h,  #2
        urshr           v3.4h,  v1.4h,  #3      // the pred4x4 part

        dup             v4.4h,  v3.h[0]
        dup             v5.4h,  v2.h[0]
        dup             v6.4h,  v2.h[1]

        zip1            v0.2d,  v4.2d,  v6.2d
        zip1            v1.2d,  v5.2d,  v6.2d
        b               .L_pred8x8_dc_10_end
endfunc

/*
 * 8x8 DC variant that reads only left[0..3]: rows 0-3 are filled with
 * (left[0..3] + 2) >> 2, rows 4-7 with the constant 512.
 */
function ff_pred8x8_l00_dc_neon_10, export=1
        sub             x2,  x0,  #2            // x2 = column left of the block

        ldcol.16        v0,  x2,  x1,  4

        addp            v0.4h,  v0.4h,  v0.4h
        addp            v0.4h,  v0.4h,  v0.4h
        urshr           v0.4h,  v0.4h,  #2

        movi            v1.8h,  #2, lsl #8      // 512
        dup             v0.8h,  v0.h[0]
        b               .L_pred8x8_dc_10_end
endfunc

/*
 * 8x8 DC variant that reads the full top row and only left[4..7]:
 *   top-left      (top[0..3] + 2) >> 2
 *   top-right     (top[4..7] + 2) >> 2
 *   bottom-left   (left[4..7] + 2) >> 2
 *   bottom-right  (top[4..7] + left[4..7] + 4) >> 3
 */
function ff_pred8x8_0lt_dc_neon_10, export=1
        add             x3,  x0,  x1,  lsl #2   // row 4 of the block
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x3,  #2            // x3 = left[4]

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  hi=1     // left[4..7] in lanes 4-7

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.8h,  v1.8h,  v1.8h
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = sum top[0..3], h[1] = sum top[4..7]
        addp            v1.4h,  v1.4h,  v1.4h   // h[1] = sum left[4..7]
        zip1            v0.2s,  v0.2s,  v1.2s
        add             v1.4h,  v0.4h,  v1.4h   // h[1] = top hi + left hi

        urshr           v2.4h,  v0.4h,  #2
        urshr           v3.4h,  v1.4h,  #3

        dup             v4.4h,  v2.h[0]
        dup             v5.4h,  v2.h[3]
        dup             v6.4h,  v2.h[1]
        dup             v7.4h,  v3.h[1]

        zip1            v0.2d,  v4.2d,  v6.2d
        zip1            v1.2d,  v5.2d,  v7.2d
        b               .L_pred8x8_dc_10_end
endfunc

/*
 * 8x8 DC variant that reads only left[4..7]: rows 0-3 are filled with the
 * constant 512, rows 4-7 with (left[4..7] + 2) >> 2.
 */
function ff_pred8x8_0l0_dc_neon_10, export=1
        add             x2,  x0,  x1,  lsl #2   // row 4 of the block
        sub             x2,  x2,  #2            // x2 = left[4]

        ldcol.16        v1,  x2,  x1,  4

        addp            v2.8h,  v1.8h,  v1.8h
        addp            v2.4h,  v2.4h,  v2.4h
        urshr           v1.4h,  v2.4h,  #2

        movi            v0.8h,  #2, lsl #8      // 512
        dup             v1.8h,  v1.h[0]
        b               .L_pred8x8_dc_10_end
endfunc