/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// Entry guard shared by the 8-bit non-intra loop filters.
//   In:       w2 = alpha, w3 = beta, x4 = address of four signed tc0 bytes
//   Out:      v24.s[0] = w6 = the four tc0 bytes
//   Clobbers: w8, flags
// Executes `ret` (leaving the enclosing function) when alpha == 0, when
// beta == 0, or when all four tc0 bytes are negative; otherwise falls
// through to the code following the macro.
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        // If alpha != 0 compare beta with 0; otherwise load NZCV = #4 (Z set)
        // so that the b.eq below is taken for alpha == 0 as well. With alpha
        // or beta equal to 0 no "< alpha" / "< beta" test can succeed, so
        // nothing would be filtered anyway.
        ccmp            w3,  #0,  #4,  ne
        mov             v24.s[0], w6
        and             w8,  w6,  w6,  lsl #16
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8   // bit 31 = AND of the four tc0 sign bits
        b.ge            2f
1:
        ret
2:
.endm

// Normal (non-intra) luma filter core for 16 pixels.
//   In:       v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//             w2 = alpha, w3 = beta, v24.s[0] = four tc0 bytes
//   Out:      v17 = p1', v16 = p0', v0 = q0', v19 = q1'
//   Clobbers: v4, v20-v24, v28, v30, x7
// Each tc0 byte is replicated over four consecutive lanes. Branches to label
// 9 of the enclosing function when no lane passes the alpha/beta/tc0 tests.
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha
        uxtl            v24.8h,  v24.8b
        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
        uxtl            v24.4s,  v24.4h
        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
        sli             v24.8h,  v24.8h,  #8
        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
        sli             v24.4s,  v24.4s,  #16
        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
        dup             v22.16b, w3                     // beta
        cmlt            v23.16b, v24.16b, #0            // lanes with tc0 < 0
        cmhi            v28.16b, v22.16b, v28.16b       // < beta
        cmhi            v30.16b, v22.16b, v30.16b       // < beta
        bic             v21.16b, v21.16b, v23.16b
        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
        and             v21.16b, v21.16b, v30.16b       // < beta
        shrn            v30.8b,  v21.8h,  #4            // pack the 16-lane mask into 64 bits
        mov             x7,  v30.d[0]
        cmhi            v17.16b, v22.16b, v17.16b       // < beta
        cmhi            v19.16b, v22.16b, v19.16b       // < beta
        cbz             x7,  9f
        and             v17.16b, v17.16b, v21.16b
        and             v19.16b, v19.16b, v21.16b
        and             v24.16b, v24.16b, v21.16b
        urhadd          v28.16b, v16.16b, v0.16b
        sub             v21.16b, v24.16b, v17.16b
        uqadd           v23.16b, v18.16b, v24.16b
        uhadd           v20.16b, v20.16b, v28.16b
        sub             v21.16b, v21.16b, v19.16b
        uhadd           v28.16b, v4.16b,  v28.16b
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b
        uqadd           v4.16b,  v2.16b,  v24.16b
        umax            v23.16b, v23.16b, v22.16b
        uqsub           v22.16b, v2.16b,  v24.16b
        umin            v28.16b, v4.16b,  v28.16b
        uxtl            v4.8h,   v0.8b
        umax            v28.16b, v28.16b, v22.16b
        uxtl2           v20.8h,  v0.16b
        usubw           v4.8h,   v4.8h,   v16.8b
        usubw2          v20.8h,  v20.8h,  v16.16b
        shl             v4.8h,   v4.8h,   #2
        shl             v20.8h,  v20.8h,  #2
        uaddw           v4.8h,   v4.8h,   v18.8b
        uaddw2          v20.8h,  v20.8h,  v18.16b
        usubw           v4.8h,   v4.8h,   v2.8b
        usubw2          v20.8h,  v20.8h,  v2.16b
        rshrn           v4.8b,   v4.8h,   #3
        rshrn2          v4.16b,  v20.8h,  #3
        bsl             v17.16b, v23.16b, v18.16b
        bsl             v19.16b, v28.16b, v2.16b
        neg             v23.16b, v21.16b
        uxtl            v28.8h,  v16.8b
        smin            v4.16b,  v4.16b,  v21.16b
        uxtl2           v21.8h,  v16.16b
        smax            v4.16b,  v4.16b,  v23.16b
        uxtl            v22.8h,  v0.8b
        uxtl2           v24.8h,  v0.16b
        saddw           v28.8h,  v28.8h,  v4.8b
        saddw2          v21.8h,  v21.8h,  v4.16b
        ssubw           v22.8h,  v22.8h,  v4.8b
        ssubw2          v24.8h,  v24.8h,  v4.16b
        sqxtun          v16.8b,  v28.8h
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b,   v22.8h
        sqxtun2         v0.16b,  v24.8h
.endm

// Luma filter across a horizontal edge, 16 pixels wide.
//   x0 = address of the q0 row, x1 = stride, w2 = alpha, w3 = beta, x4 = tc0
// Reads rows p2..q2 and rewrites rows p1, p0, q0, q1.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        ld1             {v0.16b},  [x0], x1
        ld1             {v2.16b},  [x0], x1
        ld1             {v4.16b},  [x0], x1
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1             // back to the p2 row
        ld1             {v20.16b}, [x0], x1
        ld1             {v18.16b}, [x0], x1
        ld1             {v16.16b}, [x0], x1

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1             // p1 row
        st1             {v17.16b}, [x0], x1
        st1             {v16.16b}, [x0], x1
        st1             {v0.16b},  [x0], x1
        st1             {v19.16b}, [x0]
9:
        ret
endfunc

// Luma filter across a vertical edge, 16 rows high.
//   x0 = address of q0 in the first row, x1 = stride, w2 = alpha, w3 = beta,
//   x4 = tc0
// Loads 8 bytes per row starting 4 bytes left of x0, transposes so that each
// register holds one pixel column, filters, and writes back the four middle
// columns (p1, p0, q0, q1).
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #4
        ld1             {v6.8b},    [x0], x1
        ld1             {v20.8b},   [x0], x1
        ld1             {v18.8b},   [x0], x1
        ld1             {v16.8b},   [x0], x1
        ld1             {v0.8b},    [x0], x1
        ld1             {v2.8b},    [x0], x1
        ld1             {v4.8b},    [x0], x1
        ld1             {v26.8b},   [x0], x1
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4             // rewind 16 rows
        add             x0,  x0,  #2                     // column of p1
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1
9:
        ret
endfunc


// Entry guard shared by the 8-bit intra loop filters.
//   In:       w2 = alpha, w3 = beta
//   Out:      v30 = alpha in every byte lane, v31 = beta in every byte lane
//   Clobbers: w4
// Executes `ret` (leaving the enclosing function) when alpha and beta are
// both 0.
.macro  h264_loop_filter_start_intra
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        dup             v30.16b, w2                     // alpha
        dup             v31.16b, w3                     // beta
.endm

// Intra luma filter core for 16 pixels.
//   In:       v4 = p3, v5 = p2, v6 = p1, v7 = p0,
//             v0 = q0, v1 = q1, v2 = q2, v3 = q3,
//             v30 = alpha, v31 = beta (byte lanes)
//   Out:      v5, v6, v7, v0, v1, v2 updated in place
//   Clobbers: v16-v31, x4
// Branches to label 9 of the enclosing function when no lane passes the
// alpha/beta tests.
.macro  h264_loop_filter_luma_intra
        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b
        shrn            v20.8b,  v19.8h,  #4            // pack the 16-lane mask into 64 bits
        mov             x4,  v20.d[0]
        cbz             x4,  9f

        ushll           v20.8h,  v6.8b,   #1
        ushll           v22.8h,  v1.8b,   #1
        ushll2          v21.8h,  v6.16b,  #1
        ushll2          v23.8h,  v1.16b,  #1
        uaddw           v20.8h,  v20.8h,  v7.8b
        uaddw           v22.8h,  v22.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v7.16b
        uaddw2          v23.8h,  v23.8h,  v0.16b
        uaddw           v20.8h,  v20.8h,  v1.8b
        uaddw           v22.8h,  v22.8h,  v6.8b
        uaddw2          v21.8h,  v21.8h,  v1.16b
        uaddw2          v23.8h,  v23.8h,  v6.16b

        rshrn           v24.8b,  v20.8h,  #2            // p0'_1
        rshrn           v25.8b,  v22.8h,  #2            // q0'_1
        rshrn2          v24.16b, v21.8h,  #2            // p0'_1
        rshrn2          v25.16b, v23.8h,  #2            // q0'_1

        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b       // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3)
        and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4)

        and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3
        and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4

        //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
        uaddl           v26.8h,  v5.8b,   v7.8b
        uaddl2          v27.8h,  v5.16b,  v7.16b
        uaddw           v26.8h,  v26.8h,  v0.8b
        uaddw2          v27.8h,  v27.8h,  v0.16b
        add             v20.8h,  v20.8h,  v26.8h
        add             v21.8h,  v21.8h,  v27.8h
        uaddw           v20.8h,  v20.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v0.16b
        rshrn           v20.8b,  v20.8h,  #3            // p0'_2
        rshrn2          v20.16b, v21.8h,  #3            // p0'_2
        uaddw           v26.8h,  v26.8h,  v6.8b
        uaddw2          v27.8h,  v27.8h,  v6.16b
        rshrn           v21.8b,  v26.8h,  #2            // p1'_2
        rshrn2          v21.16b, v27.8h,  #2            // p1'_2
        uaddl           v28.8h,  v4.8b,   v5.8b
        uaddl2          v29.8h,  v4.16b,  v5.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v19.8b,  v28.8h,  #3            // p2'_2
        rshrn2          v19.16b, v29.8h,  #3            // p2'_2

        //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
        uaddl           v26.8h,  v2.8b,   v0.8b
        uaddl2          v27.8h,  v2.16b,  v0.16b
        uaddw           v26.8h,  v26.8h,  v7.8b
        uaddw2          v27.8h,  v27.8h,  v7.16b
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        uaddw           v22.8h,  v22.8h,  v7.8b
        uaddw2          v23.8h,  v23.8h,  v7.16b
        rshrn           v22.8b,  v22.8h,  #3            // q0'_2
        rshrn2          v22.16b, v23.8h,  #3            // q0'_2
        uaddw           v26.8h,  v26.8h,  v1.8b
        uaddw2          v27.8h,  v27.8h,  v1.16b
        rshrn           v23.8b,  v26.8h,  #2            // q1'_2
        rshrn2          v23.16b, v27.8h,  #2            // q1'_2
        uaddl           v28.8h,  v2.8b,   v3.8b
        uaddl2          v29.8h,  v2.16b,  v3.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v26.8b,  v28.8h,  #3            // q2'_2
        rshrn2          v26.16b, v29.8h,  #3            // q2'_2

        bit             v7.16b,  v24.16b, v30.16b       // p0'_1
        bit             v0.16b,  v25.16b, v31.16b       // q0'_1
        bit             v7.16b,  v20.16b, v17.16b       // p0'_2
        bit             v6.16b,  v21.16b, v17.16b       // p1'_2
        bit             v5.16b,  v19.16b, v17.16b       // p2'_2
        bit             v0.16b,  v22.16b, v18.16b       // q0'_2
        bit             v1.16b,  v23.16b, v18.16b       // q1'_2
        bit             v2.16b,  v26.16b, v18.16b       // q2'_2
.endm

// Intra luma filter across a horizontal edge, 16 pixels wide.
//   x0 = address of the q0 row, x1 = stride, w2 = alpha, w3 = beta
// Reads rows p3..q3 and rewrites rows p2..q2.
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b}, [x0], x1              // q0
        ld1             {v1.16b}, [x0], x1              // q1
        ld1             {v2.16b}, [x0], x1              // q2
        ld1             {v3.16b}, [x0], x1              // q3
        sub             x0,  x0,  x1, lsl #3
        ld1             {v4.16b}, [x0], x1              // p3
        ld1             {v5.16b}, [x0], x1              // p2
        ld1             {v6.16b}, [x0], x1              // p1
        ld1             {v7.16b}, [x0]                  // p0

        h264_loop_filter_luma_intra

        sub             x0,  x0,  x1, lsl #1
        st1             {v5.16b}, [x0], x1              // p2
        st1             {v6.16b}, [x0], x1              // p1
        st1             {v7.16b}, [x0], x1              // p0
        st1             {v0.16b}, [x0], x1              // q0
        st1             {v1.16b}, [x0], x1              // q1
        st1             {v2.16b}, [x0]                  // q2
9:
        ret
endfunc

// Intra luma filter across a vertical edge, 16 rows high.
//   x0 = address of q0 in the first row, x1 = stride, w2 = alpha, w3 = beta
// Loads 8 bytes per row starting 4 bytes left of x0, transposes, filters,
// transposes back and stores all 8 bytes of every row.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  #4
        ld1             {v4.8b},   [x0], x1
        ld1             {v5.8b},   [x0], x1
        ld1             {v6.8b},   [x0], x1
        ld1             {v7.8b},   [x0], x1
        ld1             {v0.8b},   [x0], x1
        ld1             {v1.8b},   [x0], x1
        ld1             {v2.8b},   [x0], x1
        ld1             {v3.8b},   [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1, lsl #4             // rewind 16 rows
        st1             {v4.8b},   [x0], x1
        st1             {v5.8b},   [x0], x1
        st1             {v6.8b},   [x0], x1
        st1             {v7.8b},   [x0], x1
        st1             {v0.8b},   [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v2.8b},   [x0], x1
        st1             {v3.8b},   [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
9:
        ret
endfunc

// Normal (non-intra) chroma filter core for 8 pixels.
//   In:       v18 = p1, v16 = p0, v0 = q0, v2 = q1 (low 8 bytes)
//             w2 = alpha, w3 = beta, v24.s[0] = four tc0 bytes
//   Out:      v16 = p0', v0 = q0'
//   Clobbers: v4, v22-v26, v28, v30, x8
// Each tc0 byte is replicated over two consecutive lanes. Branches to label
// 9 of the enclosing function when no lane passes the alpha/beta tests.
.macro  h264_loop_filter_chroma
        dup             v22.8b,  w2                     // alpha
        dup             v23.8b,  w3                     // beta
        uxtl            v24.8h,  v24.8b
        uabd            v26.8b,  v16.8b,  v0.8b         // abs(p0 - q0)
        uabd            v28.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v30.8b,  v2.8b,   v0.8b         // abs(q1 - q0)
        cmhi            v26.8b,  v22.8b,  v26.8b        // < alpha
        cmhi            v28.8b,  v23.8b,  v28.8b        // < beta
        cmhi            v30.8b,  v23.8b,  v30.8b        // < beta
        uxtl            v4.8h,   v0.8b
        and             v26.8b,  v26.8b,  v28.8b
        usubw           v4.8h,   v4.8h,   v16.8b
        and             v26.8b,  v26.8b,  v30.8b
        shl             v4.8h,   v4.8h,   #2
        mov             x8,  v26.d[0]
        sli             v24.8h,  v24.8h,  #8
        uaddw           v4.8h,   v4.8h,   v18.8b
        cbz             x8,  9f
        usubw           v4.8h,   v4.8h,   v2.8b
        rshrn           v4.8b,   v4.8h,   #3
        smin            v4.8b,   v4.8b,   v24.8b
        neg             v25.8b,  v24.8b
        smax            v4.8b,   v4.8b,   v25.8b
        uxtl            v22.8h,  v0.8b
        and             v4.8b,   v4.8b,   v26.8b
        uxtl            v28.8h,  v16.8b
        saddw           v28.8h,  v28.8h,  v4.8b
        ssubw           v22.8h,  v22.8h,  v4.8b
        sqxtun          v16.8b,  v28.8h
        sqxtun          v0.8b,   v22.8h
.endm

// Chroma filter across a horizontal edge, 8 pixels wide.
//   x0 = address of the q0 row, x1 = stride, w2 = alpha, w3 = beta, x4 = tc0
// Reads rows p1..q1 and rewrites rows p0 and q0.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8b}, [x0], x1
        ld1             {v16.8b}, [x0], x1
        ld1             {v0.8b},  [x0], x1
        ld1             {v2.8b},  [x0]

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1
        st1             {v16.8b}, [x0], x1
        st1             {v0.8b},  [x0], x1
9:
        ret
endfunc

// Chroma filter across a vertical edge, 8 rows high.
//   x0 = address of q0 in the first row, x1 = stride, w2 = alpha, w3 = beta,
//   x4 = tc0
// The local label h_loop_filter_chroma420 is also entered from
// ff_h264_h_loop_filter_chroma422_neon with x0 already moved 2 bytes left
// and tc0 already in v24.s[0].
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #2
h_loop_filter_chroma420:
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3             // rewind 8 rows
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
9:
        ret
endfunc

// Chroma filter across a vertical edge, 16 rows high.
//   x0 = address of q0 in the first row, x1 = stride, w2 = alpha, w3 = beta,
//   x4 = tc0
// Runs h_loop_filter_chroma420 twice with the stride doubled: first on the
// rows starting at x0, then on the rows starting one original stride lower.
// x30 is kept in x7 across the `bl`; tc0 is reloaded from w6 because the
// filter macro overwrites v24.
function ff_h264_h_loop_filter_chroma422_neon, export=1
        h264_loop_filter_start
        add             x5,  x0,  x1
        sub             x0,  x0,  #2
        add             x1,  x1,  x1
        mov             x7,  x30
        bl              h_loop_filter_chroma420
        mov             x30, x7
        sub             x0,  x5,  #2
        mov             v24.s[0], w6
        b               h_loop_filter_chroma420
endfunc

// Intra chroma filter core for 8 pixels.
//   In:       v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes)
//             v30 = alpha, v31 = beta (byte lanes)
//   Out:      v16 = p0', v17 = q0'
//   Clobbers: v4, v6, v20, v22, v24-v28, x2
// p0' = (2*p1 + p0 + q1 + 2) >> 2 and q0' = (2*q1 + q0 + p1 + 2) >> 2 in the
// lanes that pass the alpha/beta tests. Branches to label 9 of the enclosing
// function when no lane passes.
.macro  h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b        // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b        // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b        // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b        // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b        // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b
        mov             x2,  v26.d[0]

        ushll           v4.8h,   v18.8b,  #1
        ushll           v6.8h,   v19.8b,  #1
        cbz             x2,  9f
        uaddl           v20.8h,  v16.8b,  v19.8b
        uaddl           v22.8h,  v17.8b,  v18.8b
        add             v20.8h,  v20.8h,  v4.8h
        add             v22.8h,  v22.8h,  v6.8h
        uqrshrn         v24.8b,  v20.8h,  #2
        uqrshrn         v25.8b,  v22.8h,  #2
        bit             v16.8b,  v24.8b,  v26.8b
        bit             v17.8b,  v25.8b,  v26.8b
.endm

// Intra chroma filter across a horizontal edge, 8 pixels wide.
//   x0 = address of the q0 row, x1 = stride, w2 = alpha, w3 = beta
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8b}, [x0], x1
        ld1             {v16.8b}, [x0], x1
        ld1             {v17.8b}, [x0], x1
        ld1             {v19.8b}, [x0]

        h264_loop_filter_chroma_intra

        sub             x0,  x0,  x1, lsl #1
        st1             {v16.8b}, [x0], x1
        st1             {v17.8b}, [x0], x1

9:
        ret
endfunc

// Intra chroma filter across a vertical edge, 4 rows high.
//   x0 = address of q0 in the first row, x1 = stride, w2 = alpha, w3 = beta
// Loads from x0 - 2 and stores the (p0, q0) byte pair of each row at x0 - 1.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2
        sub             x0,  x0,  #1
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc

// Intra chroma filter across a vertical edge, 8 rows high.
//   x0 = address of q0 in the first row, x1 = stride, w2 = alpha, w3 = beta
// The local label h_loop_filter_chroma420_intra is also entered from
// ff_h264_h_loop_filter_chroma422_intra_neon with x4 = load pointer
// (2 bytes left of q0), x0 = store pointer (1 byte left of q0) and
// v30/v31 already holding alpha/beta.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2
        sub             x0,  x0,  #1
h_loop_filter_chroma420_intra:
        ld1             {v18.8b},   [x4], x1
        ld1             {v16.8b},   [x4], x1
        ld1             {v17.8b},   [x4], x1
        ld1             {v19.8b},   [x4], x1
        ld1             {v18.s}[1], [x4], x1
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc

// Intra chroma filter across a vertical edge, 16 rows high.
//   x0 = address of q0 in the first row, x1 = stride, w2 = alpha, w3 = beta
// Runs h_loop_filter_chroma420_intra on the first 8 rows, then tail-calls it
// for the next 8. x4 needs no reset: the first pass leaves it 8 rows further
// down. x30 is kept in x7 across the `bl`.
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub             x4,  x0,  #2
        add             x5,  x0,  x1, lsl #3
        sub             x0,  x0,  #1
        mov             x7,  x30
        bl              h_loop_filter_chroma420_intra
        sub             x0,  x5,  #1
        mov             x30, x7
        b               h_loop_filter_chroma420_intra
endfunc

// Bi-directional weighted prediction loop, 16 pixels wide, two rows per
// iteration. Expanded inside biweight_func, which sets up:
//   x0 = first source, x1 = second source, x7 = destination, x2 = stride,
//   w3 = row count, w5/w6 = weight magnitudes for the x0/x1 data,
//   v16 = accumulator start value, v18 = per-lane shift for sshl
// \macd is the multiply-accumulate applied to x0 data, \macs the one applied
// to x1 data. Ends with `ret`.
.macro  biweight_16     macs, macd
        dup             v0.16b,  w5
        dup             v1.16b,  w6
        mov             v4.16b,  v16.16b
        mov             v6.16b,  v16.16b
1:      subs            w3,  w3,  #2
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h,   v0.8b,   v20.8b
        \macd\()2       v6.8h,   v0.16b,  v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h,   v1.8b,   v22.8b
        \macs\()2       v6.8h,   v1.16b,  v22.16b
        mov             v24.16b, v16.16b
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h,  v0.8b,   v28.8b
        \macd\()2       v26.8h,  v0.16b,  v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h,  v1.8b,   v30.8b
        \macs\()2       v26.8h,  v1.16b,  v30.16b
        sshl            v4.8h,   v4.8h,   v18.8h
        sshl            v6.8h,   v6.8h,   v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        sshl            v24.8h,  v24.8h,  v18.8h
        sshl            v26.8h,  v26.8h,  v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b,  v16.16b
        st1             {v4.16b},  [x7], x2
        mov             v4.16b,  v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b
        ret
.endm

// As biweight_16, but 8 pixels wide.
.macro  biweight_8      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #2
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h,   v0.8b,   v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h,   v1.8b,   v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h,  v0.8b,   v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h,  v1.8b,   v7.8b
        sshl            v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        sshl            v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b
        st1             {v2.8b}, [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b
        ret
.endm

// As biweight_16, but 4 pixels wide and four rows per iteration (two rows
// packed per 8-byte register). When the row counter goes negative after the
// first pair of rows, only that pair is finished and stored (label 2).
.macro  biweight_4      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #4
        ld1             {v4.s}[0], [x0], x2
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h,   v0.8b,   v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h,   v1.8b,   v5.8b
        b.lt            2f
        ld1             {v6.s}[0], [x0], x2
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h,  v0.8b,   v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h,  v1.8b,   v7.8b
        sshl            v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        sshl            v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm

// Defines ff_biweight_h264_pixels_\w\()_neon.
//   x0 = first source and destination, x1 = second source, x2 = stride,
//   w3 = row count, w4 = shift, w5 = weight for the x0 data,
//   w6 = weight for the x1 data, w7 = offset
// The accumulator starts at ((w7 + 1) | 1) << w4 and results are shifted
// right by w4 + 1 (v18 = ~w4 used as a negative sshl count).
// w8 = (w5 >> 31) ^ (w6 >> 30) selects one of four expansions so that the
// unsigned multiply-accumulate always sees non-negative weights:
//   0 -> label 10, 1 -> label 20, 2 -> label 30, anything else -> label 40.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8h,  w4
        lsl             w7,  w7,  w4
        not             v18.16b, v18.16b
        dup             v16.8h,  w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4

// Single-source weighted prediction loop, 16 pixels wide, two rows per
// iteration. Expanded inside weight_func, which sets up:
//   x0 = source, x5 = destination, x1 = stride, w2 = row count,
//   w4 = weight magnitude, v16 = shifted offset, v18 = per-lane shift for srshl
// \add combines the offset (first operand) with the product (second operand).
// Ends with `ret`.
.macro  weight_16       add
        dup             v0.16b,  w4
1:      subs            w2,  w2,  #2
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h,   v0.8b,   v20.8b
        umull2          v6.8h,   v0.16b,  v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h,  v0.8b,   v28.8b
        umull2          v26.8h,  v0.16b,  v28.16b
        \add            v4.8h,   v16.8h,  v4.8h
        srshl           v4.8h,   v4.8h,   v18.8h
        \add            v6.8h,   v16.8h,  v6.8h
        srshl           v6.8h,   v6.8h,   v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        \add            v24.8h,  v16.8h,  v24.8h
        srshl           v24.8h,  v24.8h,  v18.8h
        \add            v26.8h,  v16.8h,  v26.8h
        srshl           v26.8h,  v26.8h,  v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b},  [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b
        ret
.endm

// As weight_16, but 8 pixels wide.
.macro  weight_8        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #2
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h,   v0.8b,   v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h,  v0.8b,   v6.8b
        \add            v2.8h,   v16.8h,  v2.8h
        srshl           v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        \add            v20.8h,  v16.8h,  v20.8h
        srshl           v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b
        ret
.endm

// As weight_16, but 4 pixels wide and four rows per iteration (two rows
// packed per 8-byte register). When the row counter goes negative after the
// first pair of rows, only that pair is finished and stored (label 2).
.macro  weight_4        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #4
        ld1             {v4.s}[0], [x0], x1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h,   v0.8b,   v4.8b
        b.lt            2f
        ld1             {v6.s}[0], [x0], x1
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h,  v0.8b,   v6.8b
        \add            v2.8h,   v16.8h,  v2.8h
        srshl           v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        \add            v20.8h,  v16.8h,  v20.8h
        srshl           v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8h,   v16.8h,  v2.8h
        srshl           v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm

// Defines ff_weight_h264_pixels_\w\()_neon (in-place weighting).
//   x0 = pixel block (read via x0, written via the copy in x5), x1 = stride,
//   w2 = row count, w3 = shift, w4 = weight, w5 = offset
// v16 = offset << shift. For shift > 1 the halving add/sub supplies one bit
// of the shift and srshl the remaining shift - 1; for shift <= 1 (label 20)
// a plain add/sub is followed by srshl by the full shift. A negative weight
// is negated and the subtracting variant is used (offset - weight * pixel).
// Label 10 is defined twice; each `b.lt 10f` binds to the next one below it.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8h,  w5
        mov             x5,  x0
        b.le            20f
        sub             w6,  w6,  w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4

// Entry guard shared by the 10-bit non-intra loop filters.
//   In:       w2 = alpha, w3 = beta, x4 = address of four signed tc0 bytes
//   Out:      w2 = alpha << 2, w3 = beta << 2, v24.s[0] = w6 = the tc0 bytes
//   Clobbers: w8, flags
// Executes `ret` (leaving the enclosing function) when alpha == 0, when
// beta == 0, or when all four tc0 bytes are negative; otherwise falls
// through to the code following the macro.
.macro  h264_loop_filter_start_10
        cmp             w2,  #0
        ldr             w6,  [x4]
        // NZCV = #4 (Z set) when alpha == 0, so the b.eq below also covers
        // alpha == 0; see h264_loop_filter_start.
        ccmp            w3,  #0,  #4,  ne
        lsl             w2,  w2,  #2
        mov             v24.s[0], w6
        lsl             w3,  w3,  #2
        and             w8,  w6,  w6,  lsl #16
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8   // bit 31 = AND of the four tc0 sign bits
        b.ge            2f
1:
        ret
2:
.endm

// Entry guard shared by the 10-bit intra loop filters.
//   In:       w2 = alpha, w3 = beta
//   Out:      w2 = alpha << 2, w3 = beta << 2,
//             v30 = scaled alpha in every halfword lane,
//             v31 = scaled beta in every halfword lane
//   Clobbers: w4
// Executes `ret` (leaving the enclosing function) when alpha and beta are
// both 0.
.macro  h264_loop_filter_start_intra_10
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        lsl             w2,  w2,  #2
        lsl             w3,  w3,  #2
        dup             v30.8h,  w2                     // alpha
        dup             v31.8h,  w3                     // beta
.endm

// Normal (non-intra) chroma filter core for 8 pixels of 16 bits each.
//   In:       v18 = p1, v16 = p0, v0 = q0, v2 = q1 (halfword lanes)
//             w2 = scaled alpha, w3 = scaled beta, v24.s[0] = four tc0 bytes
//   Out:      v16 = p0', v0 = q0', both clamped to [0, 1023]
//   Clobbers: v4, v5, v22-v26, v28, v30, v31, x8, x9, flags
// Each tc0 byte is replicated over two consecutive lanes. Branches to label
// 9 of the enclosing function when no lane passes the alpha/beta tests.
.macro  h264_loop_filter_chroma_10
        dup             v22.8h,  w2                     // alpha
        dup             v23.8h,  w3                     // beta
        uxtl            v24.8h,  v24.8b                 // tc0

        uabd            v26.8h,  v16.8h,  v0.8h         // abs(p0 - q0)
        uabd            v28.8h,  v18.8h,  v16.8h        // abs(p1 - p0)
        uabd            v30.8h,  v2.8h,   v0.8h         // abs(q1 - q0)
        cmhi            v26.8h,  v22.8h,  v26.8h        // < alpha
        cmhi            v28.8h,  v23.8h,  v28.8h        // < beta
        cmhi            v30.8h,  v23.8h,  v30.8h        // < beta

        and             v26.16b, v26.16b, v28.16b
        mov             v4.16b,  v0.16b
        sub             v4.8h,   v4.8h,   v16.8h
        and             v26.16b, v26.16b, v30.16b
        shl             v4.8h,   v4.8h,   #2
        mov             x8,  v26.d[0]
        mov             x9,  v26.d[1]
        sli             v24.8h,  v24.8h,  #8
        uxtl            v24.8h,  v24.8b
        add             v4.8h,   v4.8h,   v18.8h
        adds            x8,  x8,  x9                    // zero only if both mask halves are zero
        shl             v24.8h,  v24.8h,  #2

        b.eq            9f

        movi            v31.8h,  #3
        uqsub           v24.8h,  v24.8h,  v31.8h        // tc0 * 4 - 3 == ((tc0 - 1) << 2) + 1
        sub             v4.8h,   v4.8h,   v2.8h
        srshr           v4.8h,   v4.8h,   #3
        smin            v4.8h,   v4.8h,   v24.8h
        neg             v25.8h,  v24.8h
        smax            v4.8h,   v4.8h,   v25.8h
        and             v4.16b,  v4.16b,  v26.16b
        add             v16.8h,  v16.8h,  v4.8h
        sub             v0.8h,   v0.8h,   v4.8h

        mvni            v4.8h,   #0xFC, lsl #8          // 1023 for clipping
        movi            v5.8h,   #0
        smin            v0.8h,   v0.8h,   v4.8h
        smin            v16.8h,  v16.8h,  v4.8h
        smax            v0.8h,   v0.8h,   v5.8h
        smax            v16.8h,  v16.8h,  v5.8h
.endm

// 10-bit chroma filter across a horizontal edge, 8 pixels wide.
//   x0 = address of the q0 row, x1 = stride in bytes, w2 = alpha, w3 = beta,
//   x4 = tc0
// x0 walks the p rows while x10 walks the q rows; rows p0 and q0 are
// rewritten.
function ff_h264_v_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10

        mov             x10, x0
        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8h}, [x0],  x1
        ld1             {v0.8h},  [x10], x1
        ld1             {v16.8h}, [x0],  x1
        ld1             {v2.8h},  [x10]

        h264_loop_filter_chroma_10

        sub             x0,  x10, x1, lsl #1             // p0 row
        st1             {v16.8h}, [x0], x1
        st1             {v0.8h},  [x0], x1
9:
        ret
endfunc

// 10-bit chroma filter across a vertical edge, 8 rows high.
//   x0 = address of q0 in the first row, x1 = stride in bytes, w2 = alpha,
//   w3 = beta, x4 = tc0
// The local label h_loop_filter_chroma420_10 is also entered from
// ff_h264_h_loop_filter_chroma422_neon_10 with x0 already moved 4 bytes left
// and tc0 already in v24.s[0]. Rows 0-3 go through x0 into the low register
// halves, rows 4-7 through x10 into the high halves.
function ff_h264_h_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10

        sub             x0,  x0,  #4                     // access the 2nd left pixel
h_loop_filter_chroma420_10:
        add             x10, x0,  x1, lsl #2
        ld1             {v18.d}[0], [x0],  x1
        ld1             {v18.d}[1], [x10], x1
        ld1             {v16.d}[0], [x0],  x1
        ld1             {v16.d}[1], [x10], x1
        ld1             {v0.d}[0],  [x0],  x1
        ld1             {v0.d}[1],  [x10], x1
        ld1             {v2.d}[0],  [x0],  x1
        ld1             {v2.d}[1],  [x10], x1

        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma_10

        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x10, x1, lsl #3             // back to the first row
        st1             {v18.d}[0], [x0], x1
        st1             {v16.d}[0], [x0], x1
        st1             {v0.d}[0],  [x0], x1
        st1             {v2.d}[0],  [x0], x1
        st1             {v18.d}[1], [x0], x1
        st1             {v16.d}[1], [x0], x1
        st1             {v0.d}[1],  [x0], x1
        st1             {v2.d}[1],  [x0], x1
9:
        ret
endfunc

// 10-bit chroma filter across a vertical edge, 16 rows high.
//   x0 = address of q0 in the first row, x1 = stride in bytes, w2 = alpha,
//   w3 = beta, x4 = tc0
// Runs h_loop_filter_chroma420_10 twice with the stride doubled: first on
// the rows starting at x0, then on the rows starting one original stride
// lower. x30 is kept in x7 across the `bl`; tc0 is reloaded from w6 because
// the filter macro overwrites v24.
function ff_h264_h_loop_filter_chroma422_neon_10, export=1
        h264_loop_filter_start_10
        add             x5,  x0,  x1
        sub             x0,  x0,  #4
        add             x1,  x1,  x1
        mov             x7,  x30
        bl              h_loop_filter_chroma420_10
        mov             x30, x7
        sub             x0,  x5,  #4
        mov             v24.s[0], w6
        b               h_loop_filter_chroma420_10
endfunc

// 10-bit intra chroma filter core for 8 pixels of 16 bits each.
//   In:       v18 = p1, v16 = p0, v17 = q0, v19 = q1 (halfword lanes)
//             v30 = scaled alpha, v31 = scaled beta (halfword lanes)
//   Out:      v16 = p0', v17 = q0'
//   Clobbers: v4, v6, v20, v22, v24-v28, x2, x3, flags
// p0' = (2*p1 + p0 + q1 + 2) >> 2 and q0' = (2*q1 + q0 + p1 + 2) >> 2 in the
// lanes that pass the alpha/beta tests. Branches to label 9 of the enclosing
// function when no lane passes.
.macro  h264_loop_filter_chroma_intra_10
        uabd            v26.8h,  v16.8h,  v17.8h        // abs(p0 - q0)
        uabd            v27.8h,  v18.8h,  v16.8h        // abs(p1 - p0)
        uabd            v28.8h,  v19.8h,  v17.8h        // abs(q1 - q0)
        cmhi            v26.8h,  v30.8h,  v26.8h        // < alpha
        cmhi            v27.8h,  v31.8h,  v27.8h        // < beta
        cmhi            v28.8h,  v31.8h,  v28.8h        // < beta
        and             v26.16b, v26.16b, v27.16b
        and             v26.16b, v26.16b, v28.16b
        mov             x2,  v26.d[0]
        mov             x3,  v26.d[1]

        shl             v4.8h,   v18.8h,  #1
        shl             v6.8h,   v19.8h,  #1

        adds            x2,  x2,  x3                    // zero only if both mask halves are zero
        b.eq            9f

        add             v20.8h,  v16.8h,  v19.8h
        add             v22.8h,  v17.8h,  v18.8h
        add             v20.8h,  v20.8h,  v4.8h
        add             v22.8h,  v22.8h,  v6.8h
        urshr           v24.8h,  v20.8h,  #2
        urshr           v25.8h,  v22.8h,  #2
        bit             v16.16b, v24.16b, v26.16b
        bit             v17.16b, v25.16b, v26.16b
.endm

// 10-bit intra chroma filter across a horizontal edge, 8 pixels wide.
//   x0 = address of the q0 row, x1 = stride in bytes, w2 = alpha, w3 = beta
// x0 walks the p rows while x9 walks the q rows; rows p0 and q0 are
// rewritten.
function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
        h264_loop_filter_start_intra_10
        mov             x9,  x0
        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v16.8h}, [x0], x1
        ld1             {v19.8h}, [x9]

        h264_loop_filter_chroma_intra_10

        sub             x0,  x9,  x1, lsl #1             // p0 row
        st1             {v16.8h}, [x0], x1
        st1             {v17.8h}, [x0], x1

9:
        ret
endfunc

// 10-bit intra chroma filter across a vertical edge, 4 rows high.
//   x0 = address of q0 in the first row, x1 = stride in bytes, w2 = alpha,
//   w3 = beta
// x4 loads rows 0 and 1, x9 loads rows 2 and 3, both from 4 bytes left of
// q0. The (p0, q0) halfword pair of each row is stored at x0 - 2.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
        h264_loop_filter_start_intra_10

        sub             x4,  x0,  #4
        sub             x0,  x0,  #2
        add             x9,  x4,  x1, lsl #1
        ld1             {v18.8h}, [x4], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v16.8h}, [x4], x1
        ld1             {v19.8h}, [x9], x1

        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10

        st2             {v16.h,v17.h}[0], [x0], x1
        st2             {v16.h,v17.h}[1], [x0], x1
        st2             {v16.h,v17.h}[2], [x0], x1
        st2             {v16.h,v17.h}[3], [x0], x1

9:
        ret
endfunc

// 10-bit intra chroma filter across a vertical edge, 8 rows high.
//   x0 = address of q0 in the first row, x1 = stride in bytes, w2 = alpha,
//   w3 = beta
// The local label h_loop_filter_chroma420_intra_10 is also entered from
// ff_h264_h_loop_filter_chroma422_intra_neon_10 with x4 = load pointer
// (4 bytes left of q0), x0 = store pointer (2 bytes left of q0) and v30/v31
// already holding the scaled alpha/beta. Rows 0-3 go through x4 into the low
// register halves, rows 4-7 through x9 into the high halves.
function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
        h264_loop_filter_start_intra_10
        sub             x4,  x0,  #4
        sub             x0,  x0,  #2
h_loop_filter_chroma420_intra_10:
        add             x9,  x4,  x1, lsl #2
        ld1             {v18.4h},   [x4], x1
        ld1             {v18.d}[1], [x9], x1
        ld1             {v16.4h},   [x4], x1
        ld1             {v16.d}[1], [x9], x1
        ld1             {v17.4h},   [x4], x1
        ld1             {v17.d}[1], [x9], x1
        ld1             {v19.4h},   [x4], x1
        ld1             {v19.d}[1], [x9], x1

        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10

        st2             {v16.h,v17.h}[0], [x0], x1
        st2             {v16.h,v17.h}[1], [x0], x1
        st2             {v16.h,v17.h}[2], [x0], x1
        st2             {v16.h,v17.h}[3], [x0], x1
        st2             {v16.h,v17.h}[4], [x0], x1
        st2             {v16.h,v17.h}[5], [x0], x1
        st2             {v16.h,v17.h}[6], [x0], x1
        st2             {v16.h,v17.h}[7], [x0], x1

9:
        ret
endfunc

// 10-bit intra chroma filter across a vertical edge, 16 rows high.
//   x0 = address of q0 in the first row, x1 = stride in bytes, w2 = alpha,
//   w3 = beta
// Runs h_loop_filter_chroma420_intra_10 on the first 8 rows, then tail-calls
// it for the next 8. After the first pass x9 points 8 rows below the initial
// load pointer, so it becomes the new x4. x30 is kept in x7 across the `bl`.
function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1
        h264_loop_filter_start_intra_10
        sub             x4,  x0,  #4
        add             x5,  x0,  x1, lsl #3
        sub             x0,  x0,  #2
        mov             x7,  x30
        bl              h_loop_filter_chroma420_intra_10
        mov             x4,  x9
        sub             x0,  x5,  #2
        mov             x30, x7
        b               h_loop_filter_chroma420_intra_10
endfunc